Merge baker@bk-internal.mysql.com:/home/bk/mysql-5.1

into zim.(none):/home/brian/mysql/mysql-5.1 configure.in: Auto merged mysql-test/mysql-test-run.sh: Auto merged sql/Makefile.am: Auto merged sql/ha_innodb.cc: Auto merged sql/ha_myisam.cc: Auto merged sql/lock.cc: Auto merged storage/innobase/btr/btr0pcur.c: Auto merged storage/innobase/dict/dict0dict.c: Auto merged storage/innobase/dict/dict0load.c: Auto merged storage/innobase/fil/fil0fil.c: Auto merged storage/innobase/include/btr0pcur.h: Auto merged storage/innobase/include/btr0pcur.ic: Auto merged storage/innobase/include/dict0dict.h: Auto merged storage/innobase/include/dict0load.h: Auto merged storage/innobase/include/os0file.h: Auto merged storage/innobase/include/srv0srv.h: Auto merged storage/innobase/log/log0log.c: Auto merged storage/innobase/os/os0file.c: Auto merged storage/innobase/row/row0ins.c: Auto merged storage/innobase/row/row0mysql.c: Auto merged storage/innobase/row/row0sel.c: Auto merged storage/innobase/srv/srv0srv.c: Auto merged storage/innobase/srv/srv0start.c: Auto merged storage/myisam/mi_check.c: Auto merged storage/myisam/mi_create.c: Auto merged storage/myisam/mi_dynrec.c: Auto merged storage/myisam/mi_search.c: Auto merged storage/myisam/mi_write.c: Auto merged storage/myisam/myisamdef.h: Auto merged storage/myisam/myisampack.c: Auto merged storage/myisammrg/myrg_create.c: Auto merged storage/ndb/include/kernel/signaldata/BackupImpl.hpp: Auto merged storage/ndb/include/kernel/signaldata/BackupSignalData.hpp: Auto merged storage/ndb/include/kernel/signaldata/TuxMaint.hpp: Auto merged storage/ndb/include/mgmapi/mgmapi_config_parameters.h: Auto merged storage/ndb/include/ndbapi/NdbScanOperation.hpp: Auto merged storage/ndb/include/ndbapi/NdbTransaction.hpp: Auto merged storage/ndb/src/common/debugger/signaldata/BackupImpl.cpp: Auto merged storage/ndb/src/kernel/blocks/ERROR_codes.txt: Auto merged storage/ndb/src/kernel/blocks/backup/Backup.cpp: Auto merged storage/ndb/src/kernel/blocks/backup/Backup.hpp: Auto merged 
storage/ndb/src/kernel/blocks/backup/Backup.txt: Auto merged storage/ndb/src/kernel/blocks/backup/BackupInit.cpp: Auto merged storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp: Auto merged storage/ndb/src/kernel/blocks/dbacc/Dbacc.hpp: Auto merged storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp: Auto merged storage/ndb/src/kernel/main.cpp: Auto merged storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp: Auto merged storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/DbtupRoutines.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/DbtupTrigger.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/Notes.txt: Auto merged storage/ndb/src/kernel/blocks/dbtux/DbtuxNode.cpp: Auto merged storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp: Auto merged storage/ndb/src/kernel/blocks/ndbfs/OpenFiles.hpp: Auto merged storage/ndb/src/mgmapi/mgmapi.cpp: Auto merged storage/ndb/src/mgmsrv/ConfigInfo.cpp: Auto merged storage/ndb/src/mgmsrv/MgmtSrvr.cpp: Auto merged storage/ndb/src/mgmsrv/MgmtSrvr.hpp: Auto merged storage/ndb/src/mgmsrv/MgmtSrvrGeneralSignalHandling.cpp: Auto merged storage/ndb/src/ndbapi/Ndb.cpp: Auto merged storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp: Auto merged storage/ndb/src/ndbapi/NdbIndexOperation.cpp: Auto merged storage/ndb/src/ndbapi/NdbOperationDefine.cpp: Auto merged storage/ndb/src/ndbapi/NdbOperationSearch.cpp: Auto merged storage/ndb/src/ndbapi/NdbScanOperation.cpp: Auto merged storage/ndb/src/ndbapi/NdbTransaction.cpp: Auto merged storage/ndb/src/ndbapi/Ndbif.cpp: Auto merged storage/ndb/src/ndbapi/Ndblist.cpp: Auto merged storage/ndb/src/ndbapi/TransporterFacade.cpp: Auto merged storage/ndb/src/ndbapi/ndberror.c: Auto merged storage/ndb/test/ndbapi/Makefile.am: Auto 
merged storage/ndb/test/ndbapi/testBackup.cpp: Auto merged storage/ndb/test/ndbapi/testIndex.cpp: Auto merged storage/ndb/test/ndbapi/testNodeRestart.cpp: Auto merged storage/ndb/test/ndbapi/testOIBasic.cpp: Auto merged storage/ndb/test/ndbapi/testOperations.cpp: Auto merged storage/ndb/test/run-test/daily-basic-tests.txt: Auto merged storage/ndb/test/run-test/daily-devel-tests.txt: Auto merged storage/ndb/test/src/NdbBackup.cpp: Auto merged storage/ndb/tools/desc.cpp: Auto merged
author: unknown <brian@zim.(none)> 2005-04-26 19:07:13 -0700
committer: unknown <brian@zim.(none)> 2005-04-26 19:07:13 -0700
commit: a9da10f7a8e626aac0bd3a8d82b20c7b864c7061 (patch)
tree: 7028a9aade64c6143da00a4301627ce5d262c0f0 /storage/innobase/row
parent: a2ed27af5291f778c1fc6e23cf4edc1bc36f0bed (diff)
parent: 25311ea4a5f83652959a0744d99a4eb51aa9d328 (diff)
download: mariadb-git-a9da10f7a8e626aac0bd3a8d82b20c7b864c7061.tar.gz
12 files changed, 15993 insertions, 0 deletions
diff --git a/storage/innobase/row/Makefile.am b/storage/innobase/row/Makefile.am
new file mode 100644
index 00000000000..bd09f9a237d
--- /dev/null
+++ b/storage/innobase/row/Makefile.am
@@ -0,0 +1,25 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+# & Innobase Oy
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+include ../include/Makefile.i
+
+noinst_LIBRARIES =	librow.a
+
+librow_a_SOURCES =	row0ins.c row0mysql.c row0purge.c row0row.c row0sel.c\
+			row0uins.c row0umod.c row0undo.c row0upd.c row0vers.c
+
+EXTRA_PROGRAMS =	
diff --git a/storage/innobase/row/makefilewin b/storage/innobase/row/makefilewin
new file mode 100644
index 00000000000..c17240c6119
--- /dev/null
+++ b/storage/innobase/row/makefilewin
@@ -0,0 +1,34 @@
+include ..\include\makefile.i
+
+row.lib: row0mysql.obj row0upd.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+	lib -out:..\libs\row.lib row0mysql.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+
+row0mysql.obj: row0mysql.c
+	$(CCOM) $(CFL) -c row0mysql.c
+
+row0ins.obj: row0ins.c
+	$(CCOM) $(CFL) -c row0ins.c
+
+row0sel.obj: row0sel.c
+	$(CCOM) $(CFL) -c row0sel.c
+
+row0upd.obj: row0upd.c
+	$(CCOM) $(CFL) -c row0upd.c
+
+row0undo.obj: row0undo.c
+	$(CCOM) $(CFL) -c row0undo.c
+
+row0purge.obj: row0purge.c
+	$(CCOM) $(CFL) -c row0purge.c
+
+row0row.obj: row0row.c
+	$(CCOM) $(CFL) -c row0row.c
+
+row0vers.obj: row0vers.c
+	$(CCOM) $(CFL) -c row0vers.c
+
+row0umod.obj: row0umod.c
+	$(CCOM) $(CFL) -c row0umod.c
+
+row0uins.obj: row0uins.c
+	$(CCOM) $(CFL) -c row0uins.c
diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c
new file mode 100644
index 00000000000..303fe5749bc
--- /dev/null
+++ b/storage/innobase/row/row0ins.c
@@ -0,0 +1,2424 @@
+/******************************************************
+Insert into a table
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+
+#define	ROW_INS_PREV	1
+#define	ROW_INS_NEXT	2
+
+
+/*********************************************************************
+This prototype is copied from /mysql/sql/ha_innodb.cc.
+Invalidates the MySQL query cache for the table.
+NOTE that the exact prototype of this function has to be in
+/innobase/row/row0ins.c! */
+extern
+void
+innobase_invalidate_query_cache(
+/*============================*/
+	trx_t*	trx,		/* in: transaction which modifies the table */
+	char*	full_name,	/* in: concatenation of database name, null
+				char '\0', table name, null char'\0';
+				NOTE that in Windows this is always
+				in LOWER CASE! */
+	ulint	full_name_len);	/* in: full name length where also the null
+				chars count */
+
+/**********************************************************************
+This function returns true if either of the following holds:
+
+1) The SQL query in the current thread
+is either REPLACE or LOAD DATA INFILE REPLACE.
+
+2) The SQL query in the current thread
+is INSERT ON DUPLICATE KEY UPDATE.
+
+NOTE that /mysql/innobase/row/row0ins.c must contain the
+prototype for this function! */
+
+ibool
+innobase_query_is_update(void);
+
+/*************************************************************************
+Allocates an insert node from heap and sets its fields to initial values. */
+
+ins_node_t*
+ins_node_create(
+/*============*/
+					/* out, own: insert node struct */
+	ulint		ins_type,	/* in: INS_VALUES, ... */
+	dict_table_t*	table, 		/* in: table where to insert */
+	mem_heap_t*	heap)		/* in: mem heap where created */
+{
+	ins_node_t*	ins	= mem_heap_alloc(heap, sizeof(ins_node_t));
+
+	/* Tag the struct as an insert node */
+	ins->common.type = QUE_NODE_INSERT;
+	ins->magic_n = INS_NODE_MAGIC_N;
+
+	/* Values supplied by the caller */
+	ins->ins_type = ins_type;
+	ins->table = table;
+
+	/* No index, entry or select is attached yet, and no trx id is set */
+
+	ins->state = INS_NODE_SET_IX_LOCK;
+	ins->index = NULL;
+	ins->entry = NULL;
+	ins->select = NULL;
+	ins->trx_id = ut_dulint_zero;
+
+	/* The node's own heap; note that the row field is NOT set here */
+	ins->entry_sys_heap = mem_heap_create(128);
+
+	return(ins);
+}
+
+/***************************************************************
+Fills node->entry_list with an entry built from node->row for every index. */
+static
+void
+ins_node_create_entry_list(
+/*=======================*/
+	ins_node_t*	node)	/* in: row insert node */
+{
+	dict_index_t*	cur_index;
+
+	ut_ad(node->entry_sys_heap);
+
+	UT_LIST_INIT(node->entry_list);
+
+	/* Walk the index list of the table in order; each entry is built
+	with node->entry_sys_heap and appended to the end of the list */
+
+	for (cur_index = dict_table_get_first_index(node->table);
+	     cur_index != NULL;
+	     cur_index = dict_table_get_next_index(cur_index)) {
+		dtuple_t*	tuple = row_build_index_entry(node->row,
+					cur_index, node->entry_sys_heap);
+		UT_LIST_ADD_LAST(tuple_list, node->entry_list, tuple);
+	}
+}
+
+/*********************************************************************
+Adds system field buffers, allocated from node->entry_sys_heap, to node->row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+	ins_node_t*	node)	/* in: insert node */
+{
+	dtuple_t*	row;
+	dict_table_t*	table;
+	mem_heap_t*	heap;
+	dict_col_t*	col;
+	dfield_t*	dfield;
+	ulint		len;
+	byte*		ptr;
+
+	row = node->row;
+	table = node->table;
+	heap = node->entry_sys_heap;
+
+	ut_ad(row && table && heap);
+	ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+	/* 1. Allocate buffer for row id; no value is written to it here */
+
+	col = dict_table_get_sys_col(table, DATA_ROW_ID);
+	
+	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+	ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN);
+				
+	dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+	node->row_id_buf = ptr;	/* the node remembers the buffer */
+
+	if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
+
+		/* 2. Fill in the dfield for mix id; the value is written now */
+
+		col = dict_table_get_sys_col(table, DATA_MIX_ID);
+	
+		dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+		len = mach_dulint_get_compressed_size(table->mix_id);
+		ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN);
+				
+		mach_dulint_write_compressed(ptr, table->mix_id);
+		dfield_set_data(dfield, ptr, len); /* len, not DATA_MIX_ID_LEN */
+	}
+
+	/* 3. Allocate buffer for trx id; no value is written to it here */
+
+	col = dict_table_get_sys_col(table, DATA_TRX_ID);
+	
+	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+	ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+				
+	dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+	node->trx_id_buf = ptr;	/* the node remembers the buffer */
+
+	/* 4. Allocate buffer for roll ptr; the node keeps no pointer to it */
+
+	col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+	
+	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+	ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+				
+	dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
+/*************************************************************************
+Attaches a separately constructed row to an INS_DIRECT node and resets the
+node for it. All index entry templates and sys field buffers are rebuilt, so
+this function is quite slow; it is meant only for that rare case. */
+
+void
+ins_node_set_new_row(
+/*=================*/
+	ins_node_t*	node,	/* in: insert node */
+	dtuple_t*	row)	/* in: new row (or first row) for the node */
+{
+	/* Whatever was allocated from entry_sys_heap is discarded before
+	anything new is allocated from the heap */
+
+	mem_heap_empty(node->entry_sys_heap);
+
+	node->row = row;
+	node->entry = NULL;
+	node->index = NULL;
+	node->state = INS_NODE_SET_IX_LOCK;
+
+	/* A new trx id buf is allocated below, so the trx id should be
+	written there again: forget the id stored in the node */
+
+	node->trx_id = ut_dulint_zero;
+
+	/* First the templates for the index entries, then the buffers for
+	the sys fields; both are allocated from entry_sys_heap */
+
+	ins_node_create_entry_list(node);
+	row_ins_alloc_sys_fields(node);
+}
+
+/***********************************************************************
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads. */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+				/* out: DB_SUCCESS, DB_FAIL, or error code */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether mtr holds just a leaf
+				latch or also a tree latch */
+	btr_cur_t*	cursor,	/* in: B-tree cursor */
+	dtuple_t*	entry,	/* in: index entry to insert */
+	que_thr_t*	thr,	/* in: query thread */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	big_rec_t*	dummy_big_rec;	/* receives output that is ignored */
+	mem_heap_t*	heap;
+	upd_t*		update;
+	rec_t*		rec;
+	ulint		err;
+	
+	rec = btr_cur_get_rec(cursor);
+	
+	ut_ad((cursor->index->type & DICT_CLUSTERED) == 0);
+	ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp));
+	
+	/* We know that in the alphabetical ordering, entry and rec are
+	identified. But in their binary form there may be differences if
+	there are char fields in them. Therefore we have to calculate the
+	difference. */
+	
+	heap = mem_heap_create(1024);	/* temporary: freed before return */
+	
+	update = row_upd_build_sec_rec_difference_binary(cursor->index,
+				entry, rec, thr_get_trx(thr), heap);
+	if (mode == BTR_MODIFY_LEAF) {
+		/* Try an optimistic updating of the record, keeping changes
+		within the page */
+
+		err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+						update, 0, thr, mtr);
+		if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
+			err = DB_FAIL;	/* both are reported as DB_FAIL */
+		}
+	} else  {
+		ut_a(mode == BTR_MODIFY_TREE);
+		err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+					&dummy_big_rec, update, 0, thr, mtr);
+	}
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/***********************************************************************
+Does an insert operation by delete unmarking and updating a delete marked
+existing record in the index. This situation can occur if the delete marked
+record is kept in the index for consistent reads. */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+				/* out: DB_SUCCESS, DB_FAIL, or error code */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether mtr holds just a leaf
+				latch or also a tree latch */
+	btr_cur_t*	cursor,	/* in: B-tree cursor */
+	big_rec_t**	big_rec,/* out: possible big rec vector of fields
+				which have to be stored externally by the
+				caller */
+	dtuple_t*	entry,	/* in: index entry to insert */
+	ulint*		ext_vec,/* in: array containing field numbers of
+				externally stored fields in entry, or NULL */
+	ulint		n_ext_vec,/* in: number of fields in ext_vec */
+	que_thr_t*	thr,	/* in: query thread */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	mem_heap_t*	heap;
+	rec_t*		rec;
+	upd_t*		update;
+	ulint		err;
+	
+	ut_ad(cursor->index->type & DICT_CLUSTERED);
+	
+	*big_rec = NULL;	/* only the pessimistic update gets big_rec */
+
+	rec = btr_cur_get_rec(cursor);
+
+	ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp));
+
+	heap = mem_heap_create(1024);	/* temporary: freed before return */
+	
+	/* Build an update vector containing all the fields to be modified;
+	NOTE that this vector may NOT contain system columns trx_id or
+	roll_ptr */
+	
+	update = row_upd_build_difference_binary(cursor->index, entry, ext_vec,
+			n_ext_vec, rec, thr_get_trx(thr), heap);
+	if (mode == BTR_MODIFY_LEAF) {
+		/* Try optimistic updating of the record, keeping changes
+		within the page */
+
+		err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+								   mtr);
+		if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
+			err = DB_FAIL;	/* both are reported as DB_FAIL */
+		}
+	} else  {
+		ut_a(mode == BTR_MODIFY_TREE);
+		err = btr_cur_pessimistic_update(0, cursor, big_rec, update,
+								0, thr, mtr);
+	}
+	
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/*************************************************************************
+Tells whether some ancestor node of node in a cascaded update/delete is an
+UPDATE (as opposed to a DELETE) of table. */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+				/* out: TRUE if an ancestor updates table */
+	que_node_t*	node,	/* in: node in a query graph */
+	dict_table_t*	table)	/* in: table */
+{
+	que_node_t*	ancestor = que_node_get_parent(node);
+
+	/* Climb the chain of consecutive update nodes above node; the walk
+	ends at the first ancestor which is not of type QUE_NODE_UPDATE */
+
+	while (QUE_NODE_UPDATE == que_node_get_type(ancestor)) {
+
+		upd_node_t*	upd = ancestor;
+
+		if (FALSE == upd->is_delete && table == upd->table) {
+
+			return(TRUE);
+		}
+
+		/* Every update node in the chain must have a parent */
+		ancestor = que_node_get_parent(ancestor);
+		ut_a(ancestor);
+	}
+
+	return(FALSE);
+}
+	
+/*************************************************************************
+Counts the consecutive QUE_NODE_UPDATE type ancestors (UPDATE or DELETE
+nodes) above a cascaded update/delete node. */
+static
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+				/* out: number of ancestors */
+	que_node_t*	node)	/* in: node in a query graph */
+{
+	ulint		depth = 0;
+	que_node_t*	ancestor = que_node_get_parent(node);
+
+	/* Each pass moves one level up; a counted node must have a parent */
+
+	while (QUE_NODE_UPDATE == que_node_get_type(ancestor)) {
+
+		ancestor = que_node_get_parent(ancestor);
+		ut_a(ancestor);
+
+		depth++;
+	}
+
+	return(depth);
+}
+	
+/**********************************************************************
+Calculates the update vector node->cascade_node->update for a child table in
+a cascaded update. */
+static
+ulint
+row_ins_cascade_calc_update_vec(
+/*============================*/
+					/* out: number of fields in the
+					calculated update vector; the value
+					can also be 0 if no foreign key
+					fields changed; the returned value
+					is ULINT_UNDEFINED if the column
+					type in the child table is too short
+					to fit the new value in the parent
+					table: that means the update fails */
+	upd_node_t*	node,		/* in: update node of the parent
+					table */
+	dict_foreign_t*	foreign,	/* in: foreign key constraint whose
+					type is != 0 */
+	mem_heap_t*	heap)		/* in: memory heap to use as
+					temporary storage */
+{
+	upd_node_t*	cascade		= node->cascade_node;
+	dict_table_t*	table		= foreign->foreign_table;
+	dict_index_t*	index		= foreign->foreign_index;
+	upd_t*		update;
+	upd_field_t*	ufield;
+	dict_table_t*	parent_table;
+	dict_index_t*	parent_index;
+	upd_t*		parent_update;
+	upd_field_t*	parent_ufield;
+	ulint		n_fields_updated;
+	ulint           parent_field_no;
+	dtype_t*	type;
+	ulint		i;
+	ulint		j;
+	    	
+	ut_a(node && foreign && cascade && table && index);
+
+	/* Calculate the appropriate update vector which will set the fields
+	in the child index record to the same value (possibly padded with 
+	spaces if the column is a fixed length CHAR or FIXBINARY column) as
+	the referenced index record will get in the update. */
+
+	parent_table = node->table;
+	ut_a(parent_table == foreign->referenced_table);
+	parent_index = foreign->referenced_index;
+	parent_update = node->update;
+		
+	update = cascade->update;
+
+	update->info_bits = 0;
+	update->n_fields = foreign->n_fields;	/* reset to real count below */
+		
+	n_fields_updated = 0;
+
+	for (i = 0; i < foreign->n_fields; i++) {
+
+		parent_field_no = dict_table_get_nth_col_pos(
+					parent_table,
+					dict_index_get_nth_col_no(
+							parent_index, i));
+
+		for (j = 0; j < parent_update->n_fields; j++) {
+			parent_ufield = parent_update->fields + j;
+		
+			if (parent_ufield->field_no == parent_field_no) {
+
+				ulint	fixed_size;
+
+				/* A field in the parent index record is
+				updated. Make the update vector field for
+				the child table; new_val is copied as is. */
+
+ 				ufield = update->fields + n_fields_updated;
+
+				ufield->field_no =
+					dict_table_get_nth_col_pos(table,
+					dict_index_get_nth_col_no(index, i));
+				ufield->exp = NULL;
+
+				ufield->new_val = parent_ufield->new_val;
+
+				type = dict_index_get_nth_type(index, i);
+
+				/* Do not allow a NOT NULL column to be
+				updated as NULL */
+
+				if (ufield->new_val.len == UNIV_SQL_NULL
+				    && (type->prtype & DATA_NOT_NULL)) {
+
+				        return(ULINT_UNDEFINED);
+				}
+
+				/* If the new value would not fit in the
+				column, do not allow the update */
+
+				if (ufield->new_val.len != UNIV_SQL_NULL
+				    && ufield->new_val.len
+				       > dtype_get_len(type)) {
+
+				        return(ULINT_UNDEFINED);
+				}
+
+				/* If the child column type is of fixed size
+				and the new value is shorter, the value is
+				copied to a new buffer from heap and padded
+				at the end with the pad char of the type */
+
+				fixed_size = dtype_get_fixed_size(type);
+
+				/* TODO: pad in UCS-2 with 0x0020.
+				TODO: How does the special truncation of
+				UTF-8 CHAR cols affect this? */
+
+				if (fixed_size
+				    && ufield->new_val.len != UNIV_SQL_NULL
+				    && ufield->new_val.len < fixed_size) {
+
+				        ufield->new_val.data =
+						mem_heap_alloc(heap,
+								fixed_size);
+					ufield->new_val.len = fixed_size;
+					ut_a(dtype_get_pad_char(type)
+					     != ULINT_UNDEFINED);
+
+					memset(ufield->new_val.data,
+					       (byte)dtype_get_pad_char(type),
+					       fixed_size);
+					ut_memcpy(ufield->new_val.data,
+						parent_ufield->new_val.data,
+						parent_ufield->new_val.len);
+				}
+
+				ufield->extern_storage = FALSE;
+
+				n_fields_updated++;
+			}
+		}
+	}
+
+	update->n_fields = n_fields_updated;
+
+	return(n_fields_updated);
+}
+
+/*************************************************************************
+Reports to dict_foreign_err_file a foreign key error associated with an
+update or a delete of a parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+	const char*	errstr,		/* in: error string from the viewpoint
+					of the parent table */
+	que_thr_t*	thr,		/* in: query thread whose run_node
+					is an update node */
+	dict_foreign_t*	foreign,	/* in: foreign key constraint */
+	rec_t*		rec,		/* in: a matching index record in the
+					child table, or NULL */
+	dtuple_t*	entry)		/* in: index entry in the parent
+					table, or NULL */
+{
+	FILE*	ef	= dict_foreign_err_file;
+	trx_t*	trx	= thr_get_trx(thr);
+
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(ef);	/* the report is written from the start of the file */
+	ut_print_timestamp(ef);
+	fputs(" Transaction:\n", ef);
+	trx_print(ef, trx);
+
+	fputs("Foreign key constraint fails for table ", ef);
+	ut_print_name(ef, trx, foreign->foreign_table_name);
+	fputs(":\n", ef);
+	dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign);
+	putc('\n', ef);
+	fputs(errstr, ef);
+	fputs(" in parent table, in index ", ef);
+	ut_print_name(ef, trx, foreign->referenced_index->name);
+	if (entry) {
+		fputs(" tuple:\n", ef);
+		dtuple_print(ef, entry);
+	}
+	fputs("\nBut in child table ", ef);
+	ut_print_name(ef, trx, foreign->foreign_table_name);
+	fputs(", in index ", ef);
+	ut_print_name(ef, trx, foreign->foreign_index->name);
+	if (rec) {
+		fputs(", there is a record:\n", ef);
+		rec_print(ef, rec, foreign->foreign_index); /* child index */
+	} else {
+		fputs(", the record is not available\n", ef);
+	}
+	putc('\n', ef);
+
+	mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*************************************************************************
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+	trx_t*		trx,		/* in: transaction */
+	dict_foreign_t*	foreign,	/* in: foreign key constraint */
+	rec_t*		rec,		/* in: a record in the parent table:
+					it does not match entry because we
+					have an error! Can be NULL */
+	dtuple_t*	entry)		/* in: index entry to insert in the
+					child table, or NULL */
+{
+	FILE*	ef	= dict_foreign_err_file;
+
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(ef);	/* the report is written from the start of the file */
+	ut_print_timestamp(ef);
+	fputs(" Transaction:\n", ef);
+	trx_print(ef, trx);
+	fputs("Foreign key constraint fails for table ", ef);
+	ut_print_name(ef, trx, foreign->foreign_table_name);
+	fputs(":\n", ef);
+	dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign);
+	fputs("\nTrying to add in child table, in index ", ef);
+	ut_print_name(ef, trx, foreign->foreign_index->name);
+	if (entry) {
+		fputs(" tuple:\n", ef);
+		dtuple_print(ef, entry);
+	}
+	fputs("\nBut in parent table ", ef);
+	ut_print_name(ef, trx, foreign->referenced_table_name);
+	fputs(", in index ", ef);
+	ut_print_name(ef, trx, foreign->referenced_index->name);
+	fputs(",\nthe closest match we can find is record:\n", ef);
+	if (rec && page_rec_is_supremum(rec)) {
+		/* If the cursor ended on a supremum record, it is better
+		to report the previous record in the error message, so that
+		the user gets a more descriptive error message. */
+		rec = page_rec_get_prev(rec);
+	}
+	/* rec is a record of the parent table: print it with referenced_index */
+	if (rec) {
+		rec_print(ef, rec, foreign->referenced_index);
+	}
+	putc('\n', ef);
+
+	mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*************************************************************************
+Invalidates the MySQL query cache for the table called name. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+	que_thr_t*	thr,		/* in: query thread whose run_node
+					is an update node */
+	const char*	name)		/* in: table name prefixed with
+					database name and a '/' character */
+{
+	ulint	n_bytes	= strlen(name) + 1;
+	char*	db_tbl	= mem_strdupl(name, n_bytes);
+	char*	slash	= strchr(db_tbl, '/');
+
+	/* The callee wants the database name and the table name separated
+	by a null char: cut our private copy in two at the '/' */
+	ut_a(slash);
+	*slash = '\0';
+
+	/* We call a function in ha_innodb.cc */
+#ifndef UNIV_HOTBACKUP
+	innobase_invalidate_query_cache(thr_get_trx(thr), db_tbl, n_bytes);
+#endif
+	mem_free(db_tbl);
+}
+
+/*************************************************************************
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT. */
+static
+ulint
+row_ins_foreign_check_on_constraint(
+/*================================*/
+					/* out: DB_SUCCESS, DB_LOCK_WAIT,
+					or error code */
+	que_thr_t*	thr,		/* in: query thread whose run_node
+					is an update node */
+	dict_foreign_t*	foreign,	/* in: foreign key constraint whose
+					type is != 0 */
+	btr_pcur_t*	pcur,		/* in: cursor placed on a matching
+					index record in the child table */
+	dtuple_t*	entry,		/* in: index entry in the parent
+					table */
+	mtr_t*		mtr)		/* in: mtr holding the latch of pcur
+					page */
+{
+	upd_node_t*	node;
+	upd_node_t*	cascade;
+	dict_table_t*	table		= foreign->foreign_table;
+	dict_index_t*	index;
+	dict_index_t*	clust_index;
+	dtuple_t*	ref;
+	mem_heap_t*	upd_vec_heap	= NULL;
+	rec_t*		rec;
+	rec_t*		clust_rec;
+	upd_t*		update;
+	ulint		n_to_update;
+	ulint		err;
+	ulint		i;
+	trx_t*		trx;
+	mem_heap_t*	tmp_heap	= NULL;
+
+	ut_a(thr && foreign && pcur && mtr);
+
+	trx = thr_get_trx(thr);
+
+	/* Since we are going to delete or update a row, we have to invalidate
+	the MySQL query cache for table. A deadlock of threads is not possible
+	here because the caller of this function does not hold any latches with
+	the sync0sync.h rank above the kernel mutex. The query cache mutex has
+	a rank just above the kernel mutex. */
+
+	row_ins_invalidate_query_cache(thr, table->name);
+
+	node = thr->run_node;
+
+	if (node->is_delete && 0 == (foreign->type &
+			(DICT_FOREIGN_ON_DELETE_CASCADE
+			 | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+		row_ins_foreign_report_err("Trying to delete",
+					thr, foreign,
+					btr_pcur_get_rec(pcur), entry);
+
+	        return(DB_ROW_IS_REFERENCED);
+	}
+
+	if (!node->is_delete && 0 == (foreign->type &
+			(DICT_FOREIGN_ON_UPDATE_CASCADE
+			 | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+		/* This is an UPDATE */
+			 
+		row_ins_foreign_report_err("Trying to update",
+					thr, foreign,
+					btr_pcur_get_rec(pcur), entry);
+
+	        return(DB_ROW_IS_REFERENCED);
+	}
+
+	if (node->cascade_node == NULL) {
+		/* Extend our query graph by creating a child to current
+		update node. The child is used in the cascade or set null
+		operation. */
+
+		node->cascade_heap = mem_heap_create(128);
+		node->cascade_node = row_create_update_node_for_mysql(
+						table, node->cascade_heap);
+		que_node_set_parent(node->cascade_node, node);
+	}
+
+	/* Initialize cascade_node to do the operation we want. Note that we
+	use the SAME cascade node to do all foreign key operations of the
+	SQL DELETE: the table of the cascade node may change if there are
+	several child tables to the table where the delete is done! */
+
+	cascade = node->cascade_node;
+	
+	cascade->table = table;
+
+	cascade->foreign = foreign;
+	
+	if (node->is_delete
+	    && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+		cascade->is_delete = TRUE;
+	} else {
+		cascade->is_delete = FALSE;
+
+		if (foreign->n_fields > cascade->update_n_fields) {
+			/* We have to make the update vector longer */
+
+			cascade->update = upd_create(foreign->n_fields,
+							node->cascade_heap);
+			cascade->update_n_fields = foreign->n_fields;
+		}
+	}
+
+	/* We do not allow cyclic cascaded updating (DELETE is allowed,
+	but not UPDATE) of the same table, as this can lead to an infinite
+	cycle. Check that we are not updating the same table which is
+	already being modified in this cascade chain. We have to check
+	this also because the modification of the indexes of a 'parent'
+	table may still be incomplete, and we must avoid seeing the indexes
+	of the parent table in an inconsistent state! */
+
+	if (!cascade->is_delete
+	    && row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+	        /* We do not know if this would break foreign key
+	        constraints, but play safe and return an error */
+
+	        err = DB_ROW_IS_REFERENCED;
+
+		row_ins_foreign_report_err(
+"Trying an update, possibly causing a cyclic cascaded update\n"
+"in the child table,", thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+		goto nonstandard_exit_func;
+	}
+
+	if (row_ins_cascade_n_ancestors(cascade) >= 15) {
+		err = DB_ROW_IS_REFERENCED;
+
+		row_ins_foreign_report_err(
+"Trying a too deep cascaded delete or update\n",
+			thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+		goto nonstandard_exit_func;
+	}
+
+	index = btr_pcur_get_btr_cur(pcur)->index;
+
+	ut_a(index == foreign->foreign_index);
+	
+	rec = btr_pcur_get_rec(pcur);
+
+	if (index->type & DICT_CLUSTERED) {
+		/* pcur is already positioned in the clustered index of
+		the child table */
+	
+		clust_index = index;
+		clust_rec = rec;
+	} else {
+		/* We have to look for the record in the clustered index
+		in the child table */
+
+		clust_index = dict_table_get_first_index(table);
+
+		tmp_heap = mem_heap_create(256);
+		
+		ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+								tmp_heap);
+		btr_pcur_open_with_no_init(clust_index, ref,
+			PAGE_CUR_LE, BTR_SEARCH_LEAF,
+			cascade->pcur, 0, mtr);
+
+		clust_rec = btr_pcur_get_rec(cascade->pcur);
+
+		if (!page_rec_is_user_rec(clust_rec)
+		    || btr_pcur_get_low_match(cascade->pcur)
+		       < dict_index_get_n_unique(clust_index)) {
+
+			fputs(
+			"InnoDB: error in cascade of a foreign key op\n"
+			"InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, index);
+
+			fputs("\n"
+				"InnoDB: record ", stderr);
+			rec_print(stderr, rec, index);
+			fputs("\n"
+				"InnoDB: clustered record ", stderr);
+			rec_print(stderr, clust_rec, clust_index);
+			fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+
+			err = DB_SUCCESS;
+
+			goto nonstandard_exit_func;
+		}
+	}
+
+	/* Set an X-lock on the row to delete or update in the child table */
+
+	err = lock_table(0, table, LOCK_IX, thr);
+
+	if (err == DB_SUCCESS) {
+		/* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+		we already have a normal shared lock on the appropriate
+		gap if the search criterion was not unique */
+
+		err = lock_clust_rec_read_check_and_lock_alt(0, clust_rec,
+			clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr);
+	}
+	
+	if (err != DB_SUCCESS) {
+
+		goto nonstandard_exit_func;
+	}
+
+	if (rec_get_deleted_flag(clust_rec, table->comp)) {
+		/* This can happen if there is a circular reference of
+		rows such that cascading delete comes to delete a row
+		already in the process of being delete marked */
+		err = DB_SUCCESS;		
+
+		goto nonstandard_exit_func;
+	}
+
+	if ((node->is_delete
+	    && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL))
+	   || (!node->is_delete
+	    && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+	    	
+		/* Build the appropriate update vector which sets
+		foreign->n_fields first fields in rec to SQL NULL */
+
+		update = cascade->update;
+
+		update->info_bits = 0;
+		update->n_fields = foreign->n_fields;
+		
+		for (i = 0; i < foreign->n_fields; i++) {
+			(update->fields + i)->field_no
+				= dict_table_get_nth_col_pos(table,
+					dict_index_get_nth_col_no(index, i));
+			(update->fields + i)->exp = NULL;
+			(update->fields + i)->new_val.len = UNIV_SQL_NULL;
+			(update->fields + i)->new_val.data = NULL;
+			(update->fields + i)->extern_storage = FALSE;
+		}
+	}
+
+	if (!node->is_delete
+	    && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+		/* Build the appropriate update vector which sets changing
+		foreign->n_fields first fields in rec to new values */
+
+		upd_vec_heap = mem_heap_create(256);
+
+		n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
+							      upd_vec_heap);
+		if (n_to_update == ULINT_UNDEFINED) {
+		        err = DB_ROW_IS_REFERENCED;
+
+			row_ins_foreign_report_err(
+"Trying a cascaded update where the updated value in the child\n"
+"table would not fit in the length of the column, or the value would\n"
+"be NULL and the column is declared as not NULL in the child table,",
+			thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+		       goto nonstandard_exit_func;
+		}
+
+		if (cascade->update->n_fields == 0) {
+
+			/* The update does not change any columns referred
+			to in this foreign key constraint: no need to do
+			anything */
+
+			err = DB_SUCCESS;		
+
+			goto nonstandard_exit_func;			
+		}
+	}
+	
+	/* Store pcur position and initialize or store the cascade node
+	pcur stored position */
+	
+	btr_pcur_store_position(pcur, mtr);
+	
+	if (index == clust_index) {
+		btr_pcur_copy_stored_position(cascade->pcur, pcur);
+	} else {
+		btr_pcur_store_position(cascade->pcur, mtr);
+	}
+		
+	mtr_commit(mtr);
+
+	ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+	cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+	
+	err = row_update_cascade_for_mysql(thr, cascade,
+						foreign->foreign_table);
+
+	if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
+		fprintf(stderr,
+"InnoDB: error: table %s has the counter 0 though there is\n"
+"InnoDB: a FOREIGN KEY check running on it.\n",
+			foreign->foreign_table->name);
+	}
+
+	/* Release the data dictionary latch for a while, so that we do not
+	starve other threads from doing CREATE TABLE etc. if we have a huge
+	cascaded operation running. The counter n_foreign_key_checks_running
+	will prevent other users from dropping or ALTERing the table when we
+	release the latch. */
+
+	row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+	row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+	mtr_start(mtr);
+
+	/* Restore pcur position */
+	
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	if (upd_vec_heap) {
+	        mem_heap_free(upd_vec_heap);
+	}
+
+	return(err);
+
+nonstandard_exit_func:
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	if (upd_vec_heap) {
+	        mem_heap_free(upd_vec_heap);
+	}
+
+	btr_pcur_store_position(pcur, mtr);
+
+	mtr_commit(mtr);
+	mtr_start(mtr);
+
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	return(err);
+}
+
+/*************************************************************************
+Places a shared (LOCK_S) lock of the requested type on an index record.
+The clustered or the secondary index locking routine is chosen according
+to the type of the index. Used when looking for possible duplicate key
+records and when checking foreign key constraints. */
+static
+ulint
+row_ins_set_shared_rec_lock(
+/*========================*/
+				/* out: DB_SUCCESS or error code */
+	ulint		type, 	/* in: LOCK_ORDINARY, LOCK_GAP, or
+				LOCK_REC_NOT_GAP type lock */
+	rec_t*		rec,	/* in: record */
+	dict_index_t*	index,	/* in: index */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (!(index->type & DICT_CLUSTERED)) {
+		/* A record of a secondary index */
+
+		return(lock_sec_rec_read_check_and_lock(0, rec, index,
+					offsets, LOCK_S, type, thr));
+	}
+
+	/* A record of the clustered index */
+
+	return(lock_clust_rec_read_check_and_lock(0, rec, index,
+					offsets, LOCK_S, type, thr));
+}
+
+/*************************************************************************
+Places an exclusive (LOCK_X) lock of the requested type on an index record.
+The clustered or the secondary index locking routine is chosen according
+to the type of the index. Used when locking possible duplicate key
+records. */
+static
+ulint
+row_ins_set_exclusive_rec_lock(
+/*============================*/
+				/* out: DB_SUCCESS or error code */
+	ulint		type, 	/* in: LOCK_ORDINARY, LOCK_GAP, or
+				LOCK_REC_NOT_GAP type lock */
+	rec_t*		rec,	/* in: record */
+	dict_index_t*	index,	/* in: index */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (!(index->type & DICT_CLUSTERED)) {
+		/* A record of a secondary index */
+
+		return(lock_sec_rec_read_check_and_lock(0, rec, index,
+					offsets, LOCK_X, type, thr));
+	}
+
+	/* A record of the clustered index */
+
+	return(lock_clust_rec_read_check_and_lock(0, rec, index,
+					offsets, LOCK_X, type, thr));
+}
+	
+/*******************************************************************
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock.
+
+The check is skipped (DB_SUCCESS is returned) if the transaction has
+foreign key checks switched off, if any of the first foreign->n_fields
+fields of entry is SQL NULL, or if the running node is a non-delete update
+node whose foreign constraint is this very constraint. If a lock request
+ends up in DB_LOCK_WAIT, the thread is suspended and the whole check is
+started again once the wait has ended successfully; otherwise the error
+state of the transaction is returned. */
+
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+				/* out: DB_SUCCESS,
+				DB_NO_REFERENCED_ROW,
+				or DB_ROW_IS_REFERENCED; an error code
+				from a failed lock request or from the
+				ON UPDATE / ON DELETE handling may also be
+				returned */
+	ibool		check_ref,/* in: TRUE if we want to check that
+				the referenced table is ok, FALSE if we
+				want to check the foreign key table */
+	dict_foreign_t*	foreign,/* in: foreign constraint; NOTE that the
+				tables mentioned in it must be in the
+				dictionary cache if they exist at all */
+	dict_table_t*	table,	/* in: if check_ref is TRUE, then the foreign
+				table, else the referenced table */
+	dtuple_t*	entry,	/* in: index entry for index */
+	que_thr_t*	thr)	/* in: query thread */
+{
+  	upd_node_t*  	upd_node;
+	dict_table_t*	check_table;
+	dict_index_t*	check_index;
+	ulint		n_fields_cmp;
+	rec_t*		rec;
+	btr_pcur_t	pcur;
+	ibool		moved;
+	int		cmp;
+	ulint		err;
+	ulint		i;
+	mtr_t		mtr;
+	trx_t*		trx		= thr_get_trx(thr);
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	/* Store the number of elements of offsets_ in its first element;
+	presumably rec_get_offsets() reads it as the capacity of the
+	array - TODO confirm */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	/* The check is restarted from here after a lock wait which ended
+	with trx->error_state == DB_SUCCESS */
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	err = DB_SUCCESS;
+
+	if (trx->check_foreigns == FALSE) {
+		/* The user has suppressed foreign key checks currently for
+		this session */
+		goto exit_func;
+	}
+
+	/* If any of the foreign key fields in entry is SQL NULL, we
+	suppress the foreign key check: this is compatible with Oracle,
+	for example */
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		if (UNIV_SQL_NULL == dfield_get_len(
+                                         dtuple_get_nth_field(entry, i))) {
+
+			goto exit_func;
+		}
+	}
+
+	if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+	        upd_node = thr->run_node;
+
+	        if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+		        /* If a cascaded update is done as defined by a 
+			foreign key constraint, do not check that
+			constraint for the child row. In ON UPDATE CASCADE
+			the update of the parent row is only half done when
+			we come here: if we would check the constraint here
+			for the child row it would fail.
+
+			A QUESTION remains: if in the child table there are
+			several constraints which refer to the same parent
+			table, we should merge all updates to the child as
+			one update? And the updates can be contradictory!
+			Currently we just perform the update associated
+			with each foreign key constraint, one after
+			another, and the user has problems predicting in
+			which order they are performed. */
+
+			goto exit_func;
+		}
+	}
+
+	/* Choose the table and the index where the matching records are
+	searched: the referenced (parent) side if check_ref is TRUE, the
+	foreign (child) side otherwise */
+
+	if (check_ref) {
+		check_table = foreign->referenced_table;
+		check_index = foreign->referenced_index;
+	} else {
+		check_table = foreign->foreign_table;
+		check_index = foreign->foreign_index;
+	}
+
+	if (check_table == NULL || check_table->ibd_file_missing) {
+		/* The table to search is not available. This is an error
+		only if we are checking the referenced table: then the
+		failure is written to dict_foreign_err_file under
+		dict_foreign_err_mutex. If check_ref is FALSE, err keeps
+		the value DB_SUCCESS. */
+		if (check_ref) {
+			FILE*	ef = dict_foreign_err_file;
+			mutex_enter(&dict_foreign_err_mutex);
+			rewind(ef);
+			ut_print_timestamp(ef);
+			fputs(" Transaction:\n", ef);
+			trx_print(ef, trx);
+			fputs("Foreign key constraint fails for table ", ef);
+			ut_print_name(ef, trx, foreign->foreign_table_name);
+			fputs(":\n", ef);
+			dict_print_info_on_foreign_key_in_create_format(ef,
+					trx, foreign);
+			fputs("\nTrying to add to index ", ef);
+			ut_print_name(ef, trx, foreign->foreign_index->name);
+			fputs(" tuple:\n", ef);
+			dtuple_print(ef, entry);
+			fputs("\nBut the parent table ", ef);
+			ut_print_name(ef, trx, foreign->referenced_table_name);
+		fputs("\nor its .ibd file does not currently exist!\n", ef);
+			mutex_exit(&dict_foreign_err_mutex);
+
+			err = DB_NO_REFERENCED_ROW;
+		}
+
+		goto exit_func;
+	}
+
+	ut_a(check_table && check_index);
+
+	if (check_table != table) {
+		/* We already have a LOCK_IX on table, but not necessarily
+		on check_table */
+		
+		err = lock_table(0, check_table, LOCK_IS, thr);
+
+		if (err != DB_SUCCESS) {
+
+			/* The mini-transaction has not been started yet
+			and n_fields_cmp of entry has not been changed, so
+			we can jump past the cleanup of the search */
+
+			goto do_possible_lock_wait;
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/* Store old value on n_fields_cmp */
+
+	n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+	/* Only the first foreign->n_fields fields of entry take part in
+	the comparisons of the search */
+
+	dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+
+	btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+					BTR_SEARCH_LEAF, &pcur, &mtr);
+
+	/* Scan index records and check if there is a matching record */
+
+	for (;;) {
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+			/* The page infimum is skipped without locking */
+
+			goto next_rec;
+		}
+		
+		offsets = rec_get_offsets(rec, check_index,
+					offsets, ULINT_UNDEFINED, &heap);
+
+		if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+			/* Lock the page supremum with a LOCK_ORDINARY
+			shared lock and continue to the next record */
+
+			err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec,
+						check_index, offsets, thr);
+			if (err != DB_SUCCESS) {
+
+				break;
+			}
+
+			goto next_rec;
+		}
+
+		cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+		if (cmp == 0) {
+			if (rec_get_deleted_flag(rec,
+						rec_offs_comp(offsets))) {
+				/* A delete marked matching record: lock
+				it with LOCK_ORDINARY and keep on
+				scanning */
+				err = row_ins_set_shared_rec_lock(
+						LOCK_ORDINARY, rec,
+						check_index, offsets, thr);
+				if (err != DB_SUCCESS) {
+
+					break;
+				}
+			} else {
+				/* Found a matching record. Lock only
+				a record because we can allow inserts
+				into gaps */
+				
+				err = row_ins_set_shared_rec_lock(
+						LOCK_REC_NOT_GAP, rec,
+						check_index, offsets, thr);
+
+				if (err != DB_SUCCESS) {
+
+					break;
+				}
+
+				if (check_ref) {			
+					/* The referenced row exists: the
+					constraint holds */
+					err = DB_SUCCESS;
+
+					break;
+				} else if (foreign->type != 0) {
+					/* There is an ON UPDATE or ON DELETE
+					condition: check them in a separate
+					function */
+
+					err =
+					  row_ins_foreign_check_on_constraint(
+						thr, foreign, &pcur, entry,
+									&mtr);
+					if (err != DB_SUCCESS) {
+
+						break;
+					}
+
+					/* On success the scan goes on to
+					the next record */
+				} else {
+					/* A child row refers to the row
+					and no ON UPDATE / ON DELETE
+					condition is defined */
+					row_ins_foreign_report_err(
+						"Trying to delete or update",
+						thr, foreign, rec, entry);
+
+					err = DB_ROW_IS_REFERENCED;
+					break;
+				}
+			}
+		}
+
+		if (cmp < 0) {
+			/* entry compares smaller than rec: the scan is
+			over. A LOCK_GAP shared lock is set on rec. */
+			err = row_ins_set_shared_rec_lock(LOCK_GAP,
+					rec, check_index, offsets, thr);
+			if (err != DB_SUCCESS) {
+
+				break;
+			}
+
+			if (check_ref) {			
+				err = DB_NO_REFERENCED_ROW;
+				row_ins_foreign_report_add_err(
+					trx, foreign, rec, entry);
+			} else {
+				err = DB_SUCCESS;
+			}
+
+			break;
+		}
+
+		/* The search was opened with PAGE_CUR_GE, so a user record
+		reached here is asserted to be equal to entry */
+		ut_a(cmp == 0);
+next_rec:
+		moved = btr_pcur_move_to_next(&pcur, &mtr);
+
+		if (!moved) {
+			/* The cursor could not be moved forward: no
+			matching non-delete-marked record was found */
+			if (check_ref) {			
+				rec = btr_pcur_get_rec(&pcur);
+				row_ins_foreign_report_add_err(
+					trx, foreign, rec, entry);
+				err = DB_NO_REFERENCED_ROW;
+			} else {
+				err = DB_SUCCESS;
+			}
+
+			break;
+		}
+	}
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	/* Restore old value */
+	dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+	if (err == DB_LOCK_WAIT) {
+		/* Suspend the thread until the lock wait ends; if the wait
+		ended with success, run the whole check again */
+		trx->error_state = err;
+
+		que_thr_stop_for_mysql(thr);
+
+		srv_suspend_mysql_thread(thr);
+	
+		if (trx->error_state == DB_SUCCESS) {
+
+		        goto run_again;
+		}
+
+		err = trx->error_state;
+	}
+
+exit_func:
+	/* Free the heap if rec_get_offsets() allocated one */
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/*******************************************************************
+Goes through the foreign key constraints of a table and checks each
+constraint whose foreign index is the given index. If the index is not
+mentioned in any constraint, nothing is done. For a matching constraint
+the referenced table is searched and shared locks are set which lock
+either the success or the failure of the constraint. The data dictionary
+is frozen for the duration of a check if the transaction does not already
+hold dict_operation_lock in some mode. The first failing check ends the
+scan. */
+static
+ulint
+row_ins_check_foreign_constraints(
+/*==============================*/
+				/* out: DB_SUCCESS or error code */
+	dict_table_t*	table,	/* in: table */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry for index */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_foreign_t*	fk;
+	ulint		check_err;
+	ibool		got_s_lock	= FALSE;
+	trx_t*		trx		= thr_get_trx(thr);
+
+	for (fk = UT_LIST_GET_FIRST(table->foreign_list);
+	     fk;
+	     fk = UT_LIST_GET_NEXT(foreign_list, fk)) {
+
+		if (fk->foreign_index != index) {
+			/* This constraint is defined on another index */
+
+			continue;
+		}
+
+		if (fk->referenced_table == NULL) {
+			/* Ask the dictionary for the referenced table;
+			the return value is not used here */
+
+			dict_table_get(fk->referenced_table_name, trx);
+		}
+
+		if (0 == trx->dict_operation_lock_mode) {
+			got_s_lock = TRUE;
+
+			row_mysql_freeze_data_dictionary(trx);
+		}
+
+		if (fk->referenced_table) {
+			mutex_enter(&(dict_sys->mutex));
+
+			fk->referenced_table->n_foreign_key_checks_running++;
+
+			mutex_exit(&(dict_sys->mutex));
+		}
+
+		/* NOTE that if the thread ends up waiting for a lock
+		we will release dict_operation_lock temporarily!
+		But the counter on the table protects the referenced
+		table from being dropped while the check is running. */
+
+		check_err = row_ins_check_foreign_constraint(TRUE, fk,
+						table, entry, thr);
+
+		if (fk->referenced_table) {
+			mutex_enter(&(dict_sys->mutex));
+
+			ut_a(fk->referenced_table
+				->n_foreign_key_checks_running > 0);
+			fk->referenced_table->n_foreign_key_checks_running--;
+
+			mutex_exit(&(dict_sys->mutex));
+		}
+
+		if (got_s_lock) {
+			row_mysql_unfreeze_data_dictionary(trx);
+		}
+
+		if (check_err != DB_SUCCESS) {
+
+			return(check_err);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************
+Tells whether inserting entry would violate the uniqueness of index
+because of the existing record rec. There is a violation if entry and rec
+agree on at least the n_unique first fields, rec is not delete marked,
+and, in the case of a secondary index, none of the n_unique first fields
+of entry is SQL NULL. */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+				/* out: TRUE if error */
+	rec_t*		rec,	/* in: user record; NOTE that we assume
+				that the caller already has a record lock on
+				the record! */
+	dtuple_t*	entry,	/* in: entry to insert */
+	dict_index_t*	index,	/* in: index */
+	const ulint*	offsets)/* in: rec_get_offsets(rec, index) */
+{
+	ulint	n_matched_fields	= 0;
+	ulint	n_matched_bytes		= 0;
+	ulint	n_uniq;
+	ulint	field_no;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	cmp_dtuple_rec_with_match(entry, rec, offsets,
+				&n_matched_fields, &n_matched_bytes);
+
+	if (n_matched_fields < n_uniq) {
+		/* The unique prefixes differ: no duplicate */
+
+		return(FALSE);
+	}
+
+	if (!(index->type & DICT_CLUSTERED)) {
+		/* In a unique secondary index equal key values are
+		allowed if they contain SQL NULLs */
+
+		for (field_no = 0; field_no < n_uniq; field_no++) {
+			dfield_t*	key_field;
+
+			key_field = dtuple_get_nth_field(entry, field_no);
+
+			if (dfield_get_len(key_field) == UNIV_SQL_NULL) {
+
+				return(FALSE);
+			}
+		}
+	}
+
+	/* A delete marked record does not cause a duplicate key error */
+
+	if (rec_get_deleted_flag(rec, index->table->comp)) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}	
+
+/*******************************************************************
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Sets locks on possible duplicate records: exclusive locks if
+innobase_query_is_update() returns true, shared locks otherwise. The scan
+runs in a mini-transaction of its own, and only the n_unique first fields
+of entry are compared during the scan; the original n_fields_cmp of entry
+is restored before returning. */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+				/* out: DB_SUCCESS, DB_DUPLICATE_KEY, or
+				DB_LOCK_WAIT */
+	dict_index_t*	index,	/* in: non-clustered unique index */
+	dtuple_t*	entry,	/* in: index entry */
+	que_thr_t*	thr)	/* in: query thread */
+{
+#ifndef UNIV_HOTBACKUP
+	ulint		n_unique;
+	ulint		i;
+	int		cmp;
+	ulint		n_fields_cmp;
+	rec_t*		rec;
+	btr_pcur_t	pcur;
+	ulint		err		= DB_SUCCESS;
+	ibool		moved;
+	mtr_t		mtr;
+	trx_t*		trx;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	/* Store the number of elements of offsets_ in its first element;
+	presumably rec_get_offsets() reads it as the capacity of the
+	array - TODO confirm */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	n_unique = dict_index_get_n_unique(index);
+
+	/* If the secondary index is unique, but one of the fields in the
+	n_unique first fields is NULL, a unique key violation cannot occur,
+	since we define NULL != NULL in this case */
+
+	for (i = 0; i < n_unique; i++) {
+		if (UNIV_SQL_NULL == dfield_get_len(
+                                         dtuple_get_nth_field(entry, i))) {
+
+			return(DB_SUCCESS);
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/* Store old value on n_fields_cmp */
+
+	n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+	dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+	
+	btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+	/* Scan index records and check if there is a duplicate */
+
+	for (;;) {
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+			/* The page infimum is skipped without locking */
+
+			goto next_rec;
+		}
+				
+		/* Try to place a lock on the index record */
+
+		/* trx is used only in the debug assertion below */
+		trx = thr_get_trx(thr);      
+		ut_ad(trx);
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					ULINT_UNDEFINED, &heap);
+
+		if (innobase_query_is_update()) {
+
+			/* If the SQL-query will update or replace
+			duplicate key we will take X-lock for 
+			duplicates ( REPLACE, LOAD DATAFILE REPLACE, 
+			INSERT ON DUPLICATE KEY UPDATE). */
+			
+			err = row_ins_set_exclusive_rec_lock(LOCK_ORDINARY,
+						rec, index, offsets, thr);
+		} else {
+
+			err = row_ins_set_shared_rec_lock(LOCK_ORDINARY,
+						rec, index, offsets, thr);
+		}
+
+		if (err != DB_SUCCESS) {
+
+			break;
+		}
+
+		/* The page supremum was locked above, but it is not
+		compared to entry */
+
+		if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+		
+			goto next_rec;
+		}
+
+		cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+		if (cmp == 0) {
+			if (row_ins_dupl_error_with_rec(rec, entry,
+							index, offsets)) {
+				err = DB_DUPLICATE_KEY;
+
+				/* Tell the transaction which index the
+				duplicate was found in */
+				thr_get_trx(thr)->error_info = index;
+
+				break;
+			}
+		}
+
+		if (cmp < 0) {
+			/* entry compares smaller than rec: the scan has
+			passed the records which could be duplicates */
+			break;
+		}
+
+		/* The search was opened with PAGE_CUR_GE, so a user record
+		reached here is asserted to be equal to entry */
+		ut_a(cmp == 0);
+next_rec:
+		moved = btr_pcur_move_to_next(&pcur, &mtr);
+
+		if (!moved) {
+			break;
+		}
+	}
+
+	/* Free the heap if rec_get_offsets() allocated one */
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	mtr_commit(&mtr);
+
+	/* Restore old value */
+	dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+	return(err);
+#else /* UNIV_HOTBACKUP */
+	/* This function depends on MySQL code that is not included in
+	InnoDB Hot Backup builds.  Besides, this function should never
+	be called in InnoDB Hot Backup. */
+	/* NOTE(review): there is no return statement on this path;
+	presumably ut_error never returns - confirm */
+	ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/*******************************************************************
+Checks if a unique key violation error would occur at an index entry
+insert. Sets a lock on each possible duplicate record: an exclusive lock
+if innobase_query_is_update() returns true (the statement will update or
+replace a duplicate), a shared lock otherwise. Works only for a clustered
+index!
+
+Any memory heap allocated by rec_get_offsets() during the check is freed
+on every exit path of the function. */
+static
+ulint
+row_ins_duplicate_error_in_clust(
+/*=============================*/
+				/* out: DB_SUCCESS if no error,
+				DB_DUPLICATE_KEY if error, DB_LOCK_WAIT if we
+				have to wait for a lock on a possible
+				duplicate record */
+	btr_cur_t*	cursor,	/* in: B-tree cursor */
+	dtuple_t*	entry,	/* in: entry to insert */
+	que_thr_t*	thr,	/* in: query thread */
+	mtr_t*		mtr)	/* in: mtr */
+{
+#ifndef UNIV_HOTBACKUP
+	ulint	err;
+	rec_t*	rec;
+	page_t*	page;
+	ulint	n_unique;
+	trx_t*	trx		= thr_get_trx(thr);
+	mem_heap_t*heap		= NULL;
+	ulint	offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*	offsets		= offsets_;
+	/* Store the number of elements of offsets_ in its first element;
+	presumably rec_get_offsets() reads it as the capacity of the
+	array - TODO confirm */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	UT_NOT_USED(mtr);
+	
+	ut_a(cursor->index->type & DICT_CLUSTERED);
+	ut_ad(cursor->index->type & DICT_UNIQUE);
+
+	/* NOTE: For unique non-clustered indexes there may be any number
+	of delete marked records with the same value for the non-clustered
+	index key (remember multiversioning), and which differ only in
+	the row reference part of the index record, containing the
+	clustered index key fields. For such a secondary index record,
+	to avoid race condition, we must FIRST do the insertion and after
+	that check that the uniqueness condition is not breached! */
+	
+	/* NOTE: A problem is that in the B-tree node pointers on an
+	upper level may match more to the entry than the actual existing
+	user records on the leaf level. So, even if low_match would suggest
+	that a duplicate key violation may occur, this may not be the case. */
+
+	n_unique = dict_index_get_n_unique(cursor->index);
+	
+	if (cursor->low_match >= n_unique) {
+		
+		/* The record at the cursor position is a candidate */
+
+		rec = btr_cur_get_rec(cursor);
+		page = buf_frame_align(rec);
+
+		if (rec != page_get_infimum_rec(page)) {
+			offsets = rec_get_offsets(rec, cursor->index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+			/* We set a lock on the possible duplicate: this
+			is needed in logical logging of MySQL to make
+			sure that in roll-forward we get the same duplicate
+			errors as in original execution */
+
+			if (innobase_query_is_update()) {
+
+				/* If the SQL-query will update or replace
+				duplicate key we will take X-lock for 
+				duplicates ( REPLACE, LOAD DATAFILE REPLACE, 
+				INSERT ON DUPLICATE KEY UPDATE). */
+				
+				err = row_ins_set_exclusive_rec_lock(
+					LOCK_REC_NOT_GAP,rec,cursor->index,
+					offsets, thr);
+			} else {
+				
+				err = row_ins_set_shared_rec_lock(
+					LOCK_REC_NOT_GAP,rec, cursor->index, 
+					offsets, thr);
+			} 
+
+			if (err != DB_SUCCESS) {
+				goto func_exit;
+			}
+
+			if (row_ins_dupl_error_with_rec(rec, entry,
+						cursor->index, offsets)) {
+				trx->error_info = cursor->index;
+				err = DB_DUPLICATE_KEY;
+				goto func_exit;
+			}
+		}
+	}
+
+	if (cursor->up_match >= n_unique) {
+
+		/* The successor of the cursor record is a candidate */
+
+		rec = page_rec_get_next(btr_cur_get_rec(cursor));
+		page = buf_frame_align(rec);
+
+		if (rec != page_get_supremum_rec(page)) {
+			offsets = rec_get_offsets(rec, cursor->index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+			if (innobase_query_is_update()) {
+
+				/* If the SQL-query will update or replace
+				duplicate key we will take X-lock for 
+				duplicates ( REPLACE, LOAD DATAFILE REPLACE, 
+				INSERT ON DUPLICATE KEY UPDATE). */
+
+				err = row_ins_set_exclusive_rec_lock(
+						LOCK_REC_NOT_GAP, rec,
+						cursor->index, offsets, thr);
+			} else {
+
+				err = row_ins_set_shared_rec_lock(
+						LOCK_REC_NOT_GAP, rec,
+						cursor->index, offsets, thr);
+			}
+
+			if (err != DB_SUCCESS) {
+				goto func_exit;
+			}
+
+			if (row_ins_dupl_error_with_rec(rec, entry,
+						cursor->index, offsets)) {
+				trx->error_info = cursor->index;
+				err = DB_DUPLICATE_KEY;
+				goto func_exit;
+			}
+
+			/* The heap is NOT freed here: it may still be
+			NULL at this point, and it is released at
+			func_exit on every path */
+		}
+
+		ut_a(!(cursor->index->type & DICT_CLUSTERED));
+						/* This should never happen */
+	}
+
+	err = DB_SUCCESS;
+func_exit:
+	/* rec_get_offsets() was given &heap and may have allocated a heap
+	for the offsets: release it whichever way we got here, so that
+	the lock wait and duplicate key paths do not leak it */
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	return(err);
+#else /* UNIV_HOTBACKUP */
+	/* This function depends on MySQL code that is not included in
+	InnoDB Hot Backup builds.  Besides, this function should never
+	be called in InnoDB Hot Backup. */
+	ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/*******************************************************************
+Decides whether the intended insert of an index entry has to be turned
+into a modify of the record the cursor is positioned on. That is the case
+when the cursor record is not the page infimum and it matches the entry
+on at least dict_index_get_n_unique_in_tree() first fields: n_unique
+fields in a clustered index, all fields in a secondary index. */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+				/* out: 0 if no update, ROW_INS_PREV if
+				previous should be updated; currently we
+				do the search so that only the low_match
+				record can match enough to the search tuple,
+				not the next record */
+	btr_cur_t*	cursor)	/* in: B-tree cursor */
+{
+	ulint	n_needed;
+	rec_t*	cur_rec;
+
+	/* NOTE: (compare to the note in row_ins_duplicate_error) Node
+	pointers on upper levels of the B-tree may match more to entry
+	than the actual user records on the leaf level do. Therefore a
+	big enough low_match is not sufficient alone: we also have to make
+	sure that the candidate is a real record and not the infimum. */
+
+	n_needed = dict_index_get_n_unique_in_tree(cursor->index);
+
+	if (cursor->low_match < n_needed) {
+
+		return(0);
+	}
+
+	cur_rec = btr_cur_get_rec(cursor);
+
+	if (cur_rec == page_get_infimum_rec(buf_frame_align(cur_rec))) {
+
+		return(0);
+	}
+
+	return(ROW_INS_PREV);
+}
+
+/*******************************************************************
+Tries to insert an index entry to an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily marked deleted by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to an
+existing record, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+
+If the insert or the modify produced a big_rec, its externally stored
+fields are written in a second mini-transaction after the first one has
+been committed; the return value is then the result of that write. */
+
+ulint
+row_ins_index_entry_low(
+/*====================*/
+				/* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL
+				if pessimistic retry needed, or error code */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry to insert */
+	ulint*		ext_vec,/* in: array containing field numbers of
+				externally stored fields in entry, or NULL */
+	ulint		n_ext_vec,/* in: number of fields in ext_vec */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	btr_cur_t	cursor;
+	ulint		ignore_sec_unique	= 0;
+	ulint		modify = 0; /* remove warning */
+	rec_t*		insert_rec;
+	rec_t*		rec;
+	rec_t*		first_rec;
+	ulint		err;
+	ulint		n_unique;
+	big_rec_t*	big_rec			= NULL;
+	mtr_t		mtr;
+	mem_heap_t*	heap			= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets			= offsets_;
+	/* Store the number of elements of offsets_ in its first element;
+	presumably rec_get_offsets() reads it as the capacity of the
+	array - TODO confirm */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	log_free_check();
+
+	mtr_start(&mtr);
+
+	cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return in both low_match and up_match of the
+	cursor sensible values */
+	
+	/* If the transaction has switched off the uniqueness checks of
+	secondary indexes, pass BTR_IGNORE_SEC_UNIQUE to the search */
+
+	if (!(thr_get_trx(thr)->check_unique_secondary)) {
+		ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE;
+	}
+
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+				mode | BTR_INSERT | ignore_sec_unique,
+				&cursor, 0, &mtr);
+
+	if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+		/* The insertion was made to the insert buffer already during
+		the search: we are done */
+
+		err = DB_SUCCESS;
+
+		goto function_exit;
+	}	
+					
+	/* Sanity check: if the page of the cursor contains a user record,
+	the first one must have as many fields as entry has */
+
+	first_rec = page_rec_get_next(page_get_infimum_rec(
+			buf_frame_align(btr_cur_get_rec(&cursor))));
+
+	if (!page_rec_is_supremum(first_rec)) {
+		ut_a(rec_get_n_fields(first_rec, index)
+			== dtuple_get_n_fields(entry));
+	}
+
+	n_unique = dict_index_get_n_unique(index);
+
+	/* A duplicate is possible only in a unique index and only if a
+	neighbor of the cursor position matches entry on at least the
+	n_unique first fields */
+
+	if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique
+					 || cursor.low_match >= n_unique)) {
+
+		if (index->type & DICT_CLUSTERED) {			 
+			/* Note that the following may return also
+			DB_LOCK_WAIT */
+
+			err = row_ins_duplicate_error_in_clust(&cursor,
+							entry, thr, &mtr);
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+		} else {
+			/* The scan of the secondary index uses a
+			mini-transaction of its own: commit ours for the
+			duration of the scan and start it again after it */
+			mtr_commit(&mtr);
+			err = row_ins_scan_sec_index_for_duplicate(index,
+								entry, thr);
+			mtr_start(&mtr);
+
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+
+			/* We did not find a duplicate and we have now
+			locked with s-locks the necessary records to
+			prevent any insertion of a duplicate by another
+			transaction. Let us now reposition the cursor and
+			continue the insertion. */
+			
+			/* NOTE(review): unlike the first search, this one
+			does not pass ignore_sec_unique - confirm that
+			this is intended */
+			btr_cur_search_to_nth_level(index, 0, entry,
+					PAGE_CUR_LE, mode | BTR_INSERT,
+					&cursor, 0, &mtr);
+		}		
+	}
+
+	modify = row_ins_must_modify(&cursor);
+
+	if (modify != 0) {
+		/* There is already an index entry with a long enough common
+		prefix, we must convert the insert into a modify of an
+		existing record */
+
+		/* NOTE(review): row_ins_must_modify() as defined in this
+		file returns only 0 or ROW_INS_PREV, so the ROW_INS_NEXT
+		branch below looks unreachable at present - confirm */
+
+		if (modify == ROW_INS_NEXT) {
+			rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+			btr_cur_position(index, rec, &cursor);
+		}
+
+		if (index->type & DICT_CLUSTERED) {
+			err = row_ins_clust_index_entry_by_modify(mode,
+							&cursor, &big_rec,
+							entry,
+							ext_vec, n_ext_vec,
+							thr, &mtr);
+		} else {
+			err = row_ins_sec_index_entry_by_modify(mode, &cursor,
+								entry,
+								thr, &mtr);
+		}
+		
+	} else {
+		/* A plain insert: optimistic or pessimistic according to
+		the latching mode requested by the caller */
+		if (mode == BTR_MODIFY_LEAF) {
+			err = btr_cur_optimistic_insert(0, &cursor, entry,
+					&insert_rec, &big_rec, thr, &mtr);
+		} else {
+			ut_a(mode == BTR_MODIFY_TREE);
+			err = btr_cur_pessimistic_insert(0, &cursor, entry,
+					&insert_rec, &big_rec, thr, &mtr);
+		}
+
+		if (err == DB_SUCCESS) {
+			/* Set the extern bits of the fields listed in
+			ext_vec in the inserted record */
+			if (ext_vec) {
+				rec_set_field_extern_bits(insert_rec, index,
+						ext_vec, n_ext_vec, &mtr);
+			}
+		}
+	}
+
+function_exit:
+	mtr_commit(&mtr);
+
+	if (big_rec) {
+		/* Look the record up again in a new mini-transaction and
+		write the externally stored fields of big_rec. Note that
+		this rec shadows the rec declared at the function level,
+		and that err is overwritten by the result of the write. */
+		rec_t*		rec;
+		mtr_start(&mtr);
+	
+		btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+					BTR_MODIFY_TREE, &cursor, 0, &mtr);
+		rec = btr_cur_get_rec(&cursor);
+		offsets = rec_get_offsets(rec, index, offsets,
+					ULINT_UNDEFINED, &heap);
+
+		err = btr_store_big_rec_extern_fields(index, rec,
+						offsets, big_rec, &mtr);
+
+		/* After a modify big_rec is just freed; after an insert
+		it is converted back into entry */
+
+		if (modify) {
+			dtuple_big_rec_free(big_rec);
+		} else {
+			dtuple_convert_back_big_rec(index, entry, big_rec);
+		}
+
+		mtr_commit(&mtr);
+	}
+
+	/* Free the heap if rec_get_offsets() allocated one */
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/*******************************************************************
+Inserts an entry into an index. An optimistic descent down the B-tree is
+tried first and, only if that returns DB_FAIL, a pessimistic one. If the
+entry matches enough to a delete marked record, the insert is performed by
+updating or delete unmarking that record. */
+
+ulint
+row_ins_index_entry(
+/*================*/
+				/* out: DB_SUCCESS, DB_LOCK_WAIT,
+				DB_DUPLICATE_KEY, or some other error code */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry to insert */
+	ulint*		ext_vec,/* in: array containing field numbers of
+				externally stored fields in entry, or NULL */
+	ulint		n_ext_vec,/* in: number of fields in ext_vec */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ulint	status	= DB_SUCCESS;
+
+	/* Foreign key constraints need checking only if the table of the
+	index has something in its foreign_list */
+
+	if (UT_LIST_GET_FIRST(index->table->foreign_list)) {
+		status = row_ins_check_foreign_constraints(index->table,
+							index, entry, thr);
+	}
+
+	if (status == DB_SUCCESS) {
+		/* First attempt: optimistic descent to the B-tree */
+
+		status = row_ins_index_entry_low(BTR_MODIFY_LEAF, index,
+					entry, ext_vec, n_ext_vec, thr);
+
+		if (status == DB_FAIL) {
+			/* Second attempt: pessimistic descent */
+
+			status = row_ins_index_entry_low(BTR_MODIFY_TREE,
+				index, entry, ext_vec, n_ext_vec, thr);
+		}
+	}
+
+	return(status);
+}
+
+/***************************************************************
+Fills in the dtuple fields of an index entry from the corresponding columns
+of a row. The entry fields point to the same data as the row fields; only
+the length may differ, for a column prefix index field. */
+static
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry to make */
+	dtuple_t*	row)	/* in: row */
+{
+	dict_field_t*	ifield;
+	dfield_t*	dst;
+	dfield_t*	src;
+	ulint		n_entry_fields;
+	ulint		pos;
+
+	ut_ad(entry && row);
+
+	n_entry_fields = dtuple_get_n_fields(entry);
+
+	for (pos = 0; pos < n_entry_fields; pos++) {
+		ifield = dict_index_get_nth_field(index, pos);
+		dst = dtuple_get_nth_field(entry, pos);
+		src = dtuple_get_nth_field(row, ifield->col->ind);
+
+		/* The data is never copied, only the pointer to it */
+		dst->data = src->data;
+
+		if (!(ifield->prefix_len > 0)
+		    || dfield_get_len(src) == UNIV_SQL_NULL) {
+			/* Not a column prefix, or the value is SQL NULL:
+			the length is taken over as such */
+			dst->len = src->len;
+
+			continue;
+		}
+
+		/* Column prefix index: limit the length to the prefix */
+		dst->len = dtype_get_at_most_n_mbchars(
+			dict_col_get_type(dict_field_get_col(ifield)),
+			ifield->prefix_len, dfield_get_len(src), src->data);
+	}
+}
+
+/***************************************************************
+Builds the entry of the current index of an insert node from the row of
+the node, and inserts the entry to that index. */
+static
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	ins_node_t*	node,	/* in: row insert node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_index_t*	index	= node->index;
+	dtuple_t*	entry	= node->entry;
+
+	ut_ad(dtuple_check_typed(node->row));
+
+	row_ins_index_entry_set_vals(index, entry, node->row);
+	ut_ad(dtuple_check_typed(entry));
+
+	/* No externally stored fields are passed here */
+	return(row_ins_index_entry(index, entry, NULL, 0, thr));
+}
+
+/***************************************************************
+Allocates a new row id and writes it to the row id buffer of the node,
+unless the first index of the table is unique. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+	ins_node_t*	node)	/* in: row insert node */
+{
+	dict_index_t*	first_index;
+
+	ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+	first_index = dict_table_get_first_index(node->table);
+
+	/* No row id is stored if the clustered index is unique */
+
+	if (!(first_index->type & DICT_UNIQUE)) {
+
+		/* Fill in row id value to row */
+
+		dict_sys_write_row_id(node->row_id_buf,
+					dict_sys_get_new_row_id());
+	}
+}
+
+/***************************************************************
+Gets a row to insert from the values list: each expression of the list is
+evaluated and its value is set to the row field in the same position. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+	ins_node_t*	node)	/* in: row insert node */
+{
+	que_node_t*	val_node;
+	dfield_t*	target;
+	dtuple_t*	row;
+	ulint		pos;
+
+	/* The values are not duplicated for the row: as the original
+	authors note, it is enough to copy the pointers, which is left
+	to dfield_copy_data() */
+
+	row = node->row;
+
+	for (pos = 0, val_node = node->values_list;
+	     val_node != NULL;
+	     pos++, val_node = que_node_get_next(val_node)) {
+
+		/* The expression has to be evaluated before its value
+		can be read with que_node_get_val() */
+		eval_exp(val_node);
+
+		target = dtuple_get_nth_field(row, pos);
+		dfield_copy_data(target, que_node_get_val(val_node));
+	}
+}
+
+/***************************************************************
+Gets a row to insert from the select list: the value of each item of the
+select list of node->select is set to the row field in the same
+position. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+	ins_node_t*	node)	/* in: row insert node */
+{
+	que_node_t*	sel_item;
+	dfield_t*	target;
+	dtuple_t*	row;
+	ulint		pos;
+
+	/* The field values are copied in the buffers of the select node and
+	it is safe to use them until we fetch from select again: therefore
+	only the pointers are copied here */
+
+	row = node->row;
+
+	for (pos = 0, sel_item = node->select->select_list;
+	     sel_item != NULL;
+	     pos++, sel_item = que_node_get_next(sel_item)) {
+
+		/* The nth item gives the value of the nth field */
+		target = dtuple_get_nth_field(row, pos);
+		dfield_copy_data(target, que_node_get_val(sel_item));
+	}
+}
+	
+/***************************************************************
+Inserts a row to a table: the entry of each index of the table is inserted
+in turn. After an error the node stays positioned on the failed index. */
+
+ulint
+row_ins(
+/*====*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	ins_node_t*	node,	/* in: row insert node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ulint	status;
+
+	ut_ad(node && thr);
+
+	if (node->state == INS_NODE_ALLOC_ROW_ID) {
+		/* Start of a new row: allocate the row id, position on
+		the first index and its entry, and get the row values */
+
+		row_ins_alloc_row_id_step(node);
+
+		node->index = dict_table_get_first_index(node->table);
+		node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+		if (node->ins_type == INS_SEARCHED) {
+			row_ins_get_row_from_select(node);
+		} else if (node->ins_type == INS_VALUES) {
+			row_ins_get_row_from_values(node);
+		}
+
+		node->state = INS_NODE_INSERT_ENTRIES;
+	}
+
+	ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+	for (; node->index != NULL;
+	     node->index = dict_table_get_next_index(node->index),
+	     node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry)) {
+
+		status = row_ins_index_entry_step(node, thr);
+
+		if (status != DB_SUCCESS) {
+			/* node->index and node->entry are left as they are */
+			return(status);
+		}
+	}
+
+	ut_ad(node->entry == NULL);
+
+	node->state = INS_NODE_ALLOC_ROW_ID;
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs. The insert node is driven through its states: first an IX lock is
+set on the table, then rows are inserted one at a time; for a searched
+insert the rows are fetched by running the select node in between. */
+
+que_thr_t*
+row_ins_step(
+/*=========*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ins_node_t*	node;
+	sel_node_t*	sel_node;
+	que_node_t*	parent;
+	trx_t*		trx;
+	ulint		err;
+
+	ut_ad(thr);
+
+	trx = thr_get_trx(thr);
+
+	trx_start_if_not_started(trx);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+	sel_node = node->select;
+	parent = que_node_get_parent(node);
+
+	if (thr->prev_node == parent) {
+		/* Control arrives from the parent node: start from the
+		beginning */
+		node->state = INS_NODE_SET_IX_LOCK;
+	}
+
+	/* If this is the first time this node is executed (or when
+	execution resumes after wait for the table IX lock), set an
+	IX lock on the table and reset the possible select node. */
+
+	if (node->state == INS_NODE_SET_IX_LOCK) {
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: the IX lock and
+		the trx id in the buffer of the node are renewed unless
+		the node already carries the id of this transaction */
+
+		if (!UT_DULINT_EQ(trx->id, node->trx_id)) {
+
+			trx_write_trx_id(node->trx_id_buf, trx->id);
+
+			err = lock_table(0, node->table, LOCK_IX, thr);
+
+			if (err != DB_SUCCESS) {
+
+				goto error_handling;
+			}
+
+			node->trx_id = trx->id;
+		}
+
+		node->state = INS_NODE_ALLOC_ROW_ID;
+
+		if (node->ins_type == INS_SEARCHED) {
+			/* Reset the cursor and fetch a row to insert */
+
+			sel_node->state = SEL_NODE_OPEN;
+			thr->run_node = sel_node;
+
+			return(thr);
+		}
+	}
+
+	if (node->ins_type == INS_SEARCHED
+	    && sel_node->state != SEL_NODE_FETCH) {
+
+		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* No more rows to insert */
+		thr->run_node = parent;
+
+		return(thr);
+	}
+
+	/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+	err = row_ins(node, thr);
+
+error_handling:
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		/* err == DB_LOCK_WAIT or SQL error detected */
+		return(NULL);
+	}
+
+	/* DO THE TRIGGER ACTIONS HERE */
+
+	if (node->ins_type == INS_SEARCHED) {
+		/* Fetch a row to insert */
+		thr->run_node = sel_node;
+	} else {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
new file mode 100644
index 00000000000..7f78a5b723b
--- /dev/null
+++ b/storage/innobase/row/row0mysql.c
@@ -0,0 +1,4085 @@
+/******************************************************
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+(c) 2000 Innobase Oy
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "log0log.h"
+#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
+
+/* A dummy variable used to fool the compiler */
+ibool	row_mysql_identically_false	= FALSE;
+
+/* List of tables we should drop in background. ALTER TABLE in MySQL requires
+that the table handler can drop the table in background when there are no
+queries to it any more. Protected by the kernel mutex. */
+typedef struct row_mysql_drop_struct	row_mysql_drop_t;
+struct row_mysql_drop_struct{
+	char*				table_name;	/* table to drop */
+	UT_LIST_NODE_T(row_mysql_drop_t) row_mysql_drop_list; /* list links */
+};
+
+UT_LIST_BASE_NODE_T(row_mysql_drop_t)	row_mysql_drop_list; /* list base */
+ibool	row_mysql_drop_list_inited 	= FALSE; /* presumably TRUE once the list base is initialized -- confirm */
+
+/* Magic table names for invoking various monitor threads */
+static const char S_innodb_monitor[] = "innodb_monitor";
+static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
+static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
+static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+
+/* Name suffix for recovered orphaned temporary tables */
+static const char S_recover_innodb_tmp_table[] = "_recover_innodb_tmp_table";
+/***********************************************************************
+Checks whether a table name ends in the suffix that is reserved for
+recovered orphaned temporary tables. */
+static
+ibool
+row_mysql_is_recovered_tmp_table(
+/*=============================*/
+			/* out: TRUE if name ends in the reserved suffix */
+	const char*	name)	/* in: table name */
+{
+	/* Both sizes include the terminating NUL, which is compared too */
+	ulint	suffix_size	= sizeof S_recover_innodb_tmp_table;
+	ulint	name_size	= strlen(name) + 1;
+
+	return(name_size >= suffix_size
+		&& 0 == memcmp(name + (name_size - suffix_size),
+				S_recover_innodb_tmp_table, suffix_size));
+}
+
+/***********************************************************************
+Determine if the given name is a name reserved for MySQL system tables. */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+			/* out: TRUE if name is a MySQL system table name */
+	const char*	name)	/* in: NUL-terminated table name */
+{
+	/* strncmp() stops at the NUL of a name shorter than the prefix */
+	if (strncmp(name, "mysql/", 6) != 0) {
+		return(FALSE);
+	}
+	return(0 == strcmp(name + 6, "host")
+	    || 0 == strcmp(name + 6, "user")
+	    || 0 == strcmp(name + 6, "db"));
+}
+
+/***********************************************************************
+Delays a DML operation by srv_dml_needed_delay when the purge is lagging. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+	if (0 != srv_dml_needed_delay) {	/* zero means no delay */
+		os_thread_sleep(srv_dml_needed_delay);
+	}
+}
+
+/***********************************************************************
+Frees the blob heap in prebuilt and resets the pointer to it to NULL. */
+
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct of a table handle */
+{
+	mem_heap_t*	blob_heap	= prebuilt->blob_heap;
+	mem_heap_free(blob_heap);
+	prebuilt->blob_heap = NULL;
+}
+
+/***********************************************************************
+Stores the length of a >= 5.0.3 format true VARCHAR to dest, in the MySQL
+row format: in 1 byte, or in 2 bytes in the little-endian order. */
+
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+			/* out: pointer to the data, we skip the 1 or 2 bytes
+			at the start that are used to store the len */
+	byte*	dest,	/* in: where to store */
+	ulint	len,	/* in: length, must fit in two bytes */
+	ulint	lenlen)	/* in: storage length of len: either 1 or 2 bytes */
+{
+	if (lenlen != 2) {
+		/* One length byte */
+		ut_a(lenlen == 1);
+		ut_a(len < 256);
+
+		mach_write_to_1(dest, len);
+	} else {
+		/* Two length bytes */
+		ut_a(len < 256 * 256);
+
+		mach_write_to_2_little_endian(dest, len);
+	}
+
+	return(dest + lenlen);
+}
+
+/***********************************************************************
+Reads the length of a >= 5.0.3 format true VARCHAR in the MySQL row format,
+and returns a pointer to the data that follows the length. */
+
+byte*
+row_mysql_read_true_varchar(
+/*========================*/
+			/* out: pointer to the data, we skip the 1 or 2 bytes
+			at the start that are used to store the len */
+	ulint*	len,	/* out: variable-length field length */
+	byte*	field,	/* in: field in the MySQL format */
+	ulint	lenlen)	/* in: storage length of len: either 1 or 2 bytes */
+{
+	if (lenlen != 2) {
+		ut_a(lenlen == 1);
+
+		*len = mach_read_from_1(field);
+	} else {
+		/* Two bytes, stored in the little-endian order */
+		*len = mach_read_from_2_little_endian(field);
+	}
+
+	/* Skip the length bytes */
+	return(field + lenlen);
+}
+
+/***********************************************************************
+Stores a reference to a BLOB (its length and data pointer) in the MySQL format. */
+
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+	byte*	dest,		/* in: where to store */
+	ulint	col_len,	/* in: dest buffer size: determines into
+				how many bytes the BLOB length is stored,
+				the space for the length may vary from 1
+				to 4 bytes */
+	byte*	data,		/* in: BLOB data; if the value to store
+				is SQL NULL this should be NULL pointer */
+	ulint	len)		/* in: BLOB length; if the value to store
+				is SQL NULL this should be 0; remember
+				also to set the NULL bit in the MySQL record
+				header! */
+{
+	/* The first col_len - 8 bytes of dest hold the BLOB length, and
+	the last 8 bytes are reserved for the pointer to the data */
+	ulint	len_size	= col_len - 8;
+
+	/* MySQL might assume the field is set to zero except the length
+	and the pointer fields */
+	memset(dest, '\0', col_len);
+
+	/* The length must fit in the bytes that are reserved for it */
+	ut_a(len_size > 1 || len < 256);
+	ut_a(len_size > 2 || len < 256 * 256);
+	ut_a(len_size > 3 || len < 256 * 256 * 256);
+
+	mach_write_to_n_little_endian(dest, len_size, len);
+
+	/* Only sizeof(byte*) bytes of the 8-byte pointer slot are written */
+	ut_memcpy(dest + len_size, (byte*)&data, sizeof(byte*));
+}
+
+/***********************************************************************
+Reads a reference to a BLOB in the MySQL format. */
+
+byte*
+row_mysql_read_blob_ref(
+/*====================*/
+				/* out: pointer to BLOB data */
+	ulint*	len,		/* out: BLOB length */
+	byte*	ref,		/* in: BLOB reference in the MySQL format */
+	ulint	col_len)	/* in: BLOB reference length (not BLOB
+				length) */
+{
+	ulint	len_size	= col_len - 8;	/* bytes used for the length */
+	byte*	blob;
+
+	*len = mach_read_from_n_little_endian(ref, len_size);
+	/* The pointer to the data is stored right after the length */
+	ut_memcpy((byte*)&blob, ref + len_size, sizeof(byte*));
+	return(blob);
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c. */
+
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+					/* out: buf + col_len if buf was used
+					(integer column), else buf as such */
+	dfield_t*	dfield,		/* in/out: dfield where dtype
+					information must be already set when
+					this function is called! */
+	byte*		buf,		/* in/out: buffer for a converted
+					integer value; this must be at least
+					col_len long then! */
+	ibool		row_format_col,	/* TRUE if the mysql_data is from
+					a MySQL row, FALSE if from a MySQL
+					key value;
+					in MySQL, a true VARCHAR storage
+					format differs in a row and in a
+					key value: in a key value the length
+					is always stored in 2 bytes! */
+	byte*		mysql_data,	/* in: MySQL column value, not
+					SQL NULL; NOTE that dfield may also
+					get a pointer to mysql_data,
+					therefore do not discard this as long
+					as dfield is used! */
+	ulint		col_len,	/* in: MySQL column length; NOTE that
+					this is the storage length of the
+					column in the MySQL format row, not
+					necessarily the length of the actual
+					payload data; if the column is a true
+					VARCHAR then this is irrelevant */
+	ibool		comp)		/* in: TRUE = compact format */
+{
+	byte*		ptr 	= mysql_data;
+	dtype_t*	dtype;
+	ulint		type;
+	ulint		lenlen;
+
+	dtype = dfield_get_type(dfield);
+
+	type = dtype->mtype;
+
+	if (type == DATA_INT) {
+		/* Store integer data in Innobase in a big-endian format,
+		sign bit negated if the data is a signed integer. In MySQL,
+		integers are stored in a little-endian format. */
+
+		ptr = buf + col_len;
+
+		for (;;) {
+			ptr--;
+			*ptr = *mysql_data;
+			if (ptr == buf) {
+				break;
+			}
+			mysql_data++;
+		}
+
+		if (!(dtype->prtype & DATA_UNSIGNED)) {
+			/* ptr == buf here: the most significant byte */
+			*ptr = (byte) (*ptr ^ 128);
+		}
+
+		buf += col_len;	/* buf was used for the converted value */
+	} else if ((type == DATA_VARCHAR
+		    || type == DATA_VARMYSQL
+		    || type == DATA_BINARY)) {
+
+		if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+			/* The length of the actual data is stored to 1 or 2
+			bytes at the start of the field */
+			
+			if (row_format_col) {
+				if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+					lenlen = 2;
+				} else {
+					lenlen = 1;
+				}
+			} else {
+				/* In a MySQL key value, lenlen is always 2 */
+				lenlen = 2;
+			}
+			/* col_len becomes the actual data length here */
+			ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+								      lenlen);
+		} else {
+			/* Remove trailing spaces from old style VARCHAR
+			columns. */
+
+			/* Handle UCS2 strings differently. */
+			ulint	mbminlen	= dtype_get_mbminlen(dtype);
+
+			ptr = mysql_data;
+
+			if (mbminlen == 2) {
+				/* space=0x0020 */
+				/* Trim "half-chars", just in case. */
+				col_len &= ~1;
+
+				while (col_len >= 2 && ptr[col_len - 2] == 0x00
+						&& ptr[col_len - 1] == 0x20) {
+					col_len -= 2;
+				}
+			} else {
+				ut_a(mbminlen == 1);
+				/* space=0x20 */
+				while (col_len > 0
+						&& ptr[col_len - 1] == 0x20) {
+					col_len--;
+				}
+			}
+		}
+	} else if (comp && type == DATA_MYSQL
+			&& dtype_get_mbminlen(dtype) == 1
+			&& dtype_get_mbmaxlen(dtype) > 1) {
+		/* In some cases we strip trailing spaces from UTF-8 and other
+		multibyte charsets, from FIXED-length CHAR columns, to save
+		space. UTF-8 would otherwise normally use 3 * the string length
+		bytes to store a latin1 string! */
+
+		/* We assume that this CHAR field is encoded in a
+		variable-length character set where spaces have
+		1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+		Consider a CHAR(n) field, a field of n characters.
+		It will contain between n * mbminlen and n * mbmaxlen bytes.
+		We will try to truncate it to n bytes by stripping
+		space padding.  If the field contains single-byte
+		characters only, it will be truncated to n characters.
+		Consider a CHAR(5) field containing the string ".a   "
+		where "." denotes a 3-byte character represented by
+		the bytes "$%&".  After our stripping, the string will
+		be stored as "$%&a " (5 bytes).  The string ".abc "
+		will be stored as "$%&abc" (6 bytes).
+
+		The space padding will be restored in row0sel.c, function
+		row_sel_field_store_in_mysql_format(). */
+
+		ulint		n_chars;
+
+		ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+		n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+		/* Strip space padding. */
+		while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+			col_len--;
+		}
+	} else if (type == DATA_BLOB && row_format_col) {
+		/* col_len is here the length of the BLOB reference */
+		ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+	}
+	/* ptr and col_len now give the data that is set to dfield */
+	dfield_set_data(dfield, ptr, col_len);
+
+	return(buf);
+}
+
+/******************************************************************
+Converts a row from the MySQL format to the Innobase format, one template
+column at a time. A MySQL format key value is converted to an InnoDB dtuple
+elsewhere: see row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+	dtuple_t*	row,		/* in/out: Innobase row where the
+					field type information is already
+					copied there! */
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct where template
+					must be of type ROW_MYSQL_WHOLE_ROW */
+	byte*		mysql_rec)	/* in: row in the MySQL format;
+					NOTE: do not discard as long as
+					row is used, as row may contain
+					pointers to this record! */
+{
+	mysql_row_templ_t*	col_templ;
+	dfield_t*		dfield;
+	ulint			col_no;
+
+	ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+	ut_ad(prebuilt->mysql_template);
+
+	for (col_no = 0; col_no < prebuilt->n_template; col_no++) {
+
+		col_templ = &prebuilt->mysql_template[col_no];
+		dfield = dtuple_get_nth_field(row, col_no);
+
+		/* A zero null bit mask means that the column can not be
+		SQL NULL; otherwise the null bit in the record is tested */
+
+		if (col_templ->mysql_null_bit_mask != 0
+		    && (mysql_rec[col_templ->mysql_null_byte_offset]
+			& (byte) (col_templ->mysql_null_bit_mask))) {
+			/* It is SQL NULL */
+
+			dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+			continue;
+		}
+
+		/* The value is converted using the part of ins_upd_rec_buff
+		at the offset of the column as the conversion buffer */
+
+		row_mysql_store_col_in_innobase_format(dfield,
+					prebuilt->ins_upd_rec_buff
+					+ col_templ->mysql_col_offset,
+					TRUE, /* MySQL row format data */
+					mysql_rec + col_templ->mysql_col_offset,
+					col_templ->mysql_col_len,
+					prebuilt->table->comp);
+	}
+}
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine. The
+error to handle is read from trx->error_state, which is then reset to
+DB_SUCCESS. Depending on the error, the latest statement or the whole
+transaction may be rolled back. */
+
+ibool
+row_mysql_handle_errors(
+/*====================*/
+				/* out: TRUE if it was a lock wait and
+				we should continue running the query thread */
+	ulint*		new_err,/* out: possible new error encountered in
+				lock wait, or if no new error, the value
+				of trx->error_state at the entry of this
+				function */
+	trx_t*		trx,	/* in: transaction */
+	que_thr_t*	thr,	/* in: query thread */
+	trx_savept_t*	savept)	/* in: savepoint or NULL */
+{
+#ifndef UNIV_HOTBACKUP
+	ulint	err;
+
+handle_new_error:
+	err = trx->error_state;
+
+	ut_a(err != DB_SUCCESS);
+
+	trx->error_state = DB_SUCCESS;
+
+	/* A lock wait is the only case where TRUE can be returned: the
+	thread is suspended, and if no new error has appeared when the
+	suspension ends, the caller may continue */
+
+	if (err == DB_LOCK_WAIT) {
+
+		srv_suspend_mysql_thread(thr);
+
+		if (trx->error_state != DB_SUCCESS) {
+			/* A new error: handle it from the start */
+			que_thr_stop_for_mysql(thr);
+
+			goto handle_new_error;
+		}
+
+		*new_err = err;
+
+		return(TRUE);
+	}
+
+	if (err == DB_DUPLICATE_KEY
+	    || err == DB_TOO_BIG_RECORD
+	    || err == DB_ROW_IS_REFERENCED
+	    || err == DB_NO_REFERENCED_ROW
+	    || err == DB_CANNOT_ADD_CONSTRAINT
+	    || err == DB_OUT_OF_FILE_SPACE) {
+
+		/* These errors are all resolved in the same way, and only
+		if the caller has given a savepoint */
+
+		if (savept) {
+			/* Roll back the latest, possibly incomplete
+			insertion or update */
+
+			trx_general_rollback_for_mysql(trx, TRUE, savept);
+		}
+
+		/* According to the original comments MySQL will roll back the
+		latest SQL statement (not stated for DB_DUPLICATE_KEY) */
+
+	} else if (err == DB_DEADLOCK
+		   || err == DB_LOCK_WAIT_TIMEOUT
+		   || err == DB_LOCK_TABLE_FULL) {
+		/* Roll back the whole transaction; this resolution was added
+		to version 3.23.43 */
+
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+	} else if (err == DB_MUST_GET_MORE_FILE_SPACE) {
+		/* This error is fatal: the process exits */
+
+		fputs(
+		"InnoDB: The database cannot continue operation because of\n"
+		"InnoDB: lack of space. You must add a new data file to\n"
+		"InnoDB: my.cnf and restart the database.\n", stderr);
+
+		exit(1);
+
+	} else if (err == DB_CORRUPTION) {
+		/* Only a message is printed here; nothing is rolled back */
+
+		fputs(
+	    "InnoDB: We detected index corruption in an InnoDB type table.\n"
+	    "InnoDB: You have to dump + drop + reimport the table or, in\n"
+	    "InnoDB: a case of widespread corruption, dump all InnoDB\n"
+	    "InnoDB: tables and recreate the whole InnoDB tablespace.\n"
+	    "InnoDB: If the mysqld server crashes after the startup or when\n"
+	    "InnoDB: you dump the tables, look at\n"
+	    "InnoDB: http://dev.mysql.com/doc/mysql/en/Forcing_recovery.html"
+	    " for help.\n", stderr);
+
+	} else {
+		/* An error code that is not known to this function */
+		fprintf(stderr, "InnoDB: unknown error code %lu\n",
+			(ulong) err);
+		ut_error;
+	}
+
+	/* If trx->error_state has been set again in the meantime, that
+	error is the one reported to the caller */
+
+	*new_err = (trx->error_state != DB_SUCCESS)
+			? trx->error_state : err;
+
+	trx->error_state = DB_SUCCESS;
+
+	return(FALSE);
+#else /* UNIV_HOTBACKUP */
+	/* This function depends on MySQL code that is not included in
+	InnoDB Hot Backup builds.  Besides, this function should never
+	be called in InnoDB Hot Backup. */
+	ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/************************************************************************
+Create a prebuilt struct for a MySQL table handle. The struct is allocated
+from a memory heap of its own, a pointer to which is kept in the struct. */
+
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+				/* out, own: a prebuilt struct */
+	dict_table_t*	table)	/* in: Innobase table handle */
+{
+	row_prebuilt_t*	prebuilt;
+	mem_heap_t*	heap;
+	dict_index_t*	clust_index;
+	dtuple_t*	clust_ref;
+	ulint		n_search_fields;
+	ulint		n_unique;
+	ulint		slot;
+
+	heap = mem_heap_create(128);
+
+	prebuilt = mem_heap_alloc(heap, sizeof(row_prebuilt_t));
+
+	/* Validity markers and the basic references */
+	prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+	prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+	prebuilt->heap = heap;
+	prebuilt->table = table;
+	prebuilt->trx = NULL;
+	prebuilt->index = NULL;
+
+	/* Flags */
+	prebuilt->sql_stat_start = TRUE;
+	prebuilt->mysql_has_locked = FALSE;
+	prebuilt->used_in_HANDLER = FALSE;
+
+	/* There is no MySQL row template at this point */
+	prebuilt->n_template = 0;
+	prebuilt->mysql_template = NULL;
+
+	/* Neither are there any query graphs, nodes or record buffers */
+	prebuilt->ins_node = NULL;
+	prebuilt->upd_node = NULL;
+	prebuilt->ins_graph = NULL;
+	prebuilt->upd_graph = NULL;
+	prebuilt->sel_graph = NULL;
+	prebuilt->ins_upd_rec_buff = NULL;
+
+	/* No rows are cached for fetches, and the auxiliary heaps have
+	not been created */
+	for (slot = 0; slot < MYSQL_FETCH_CACHE_SIZE; slot++) {
+		prebuilt->fetch_cache[slot] = NULL;
+	}
+
+	prebuilt->n_fetch_cached = 0;
+	prebuilt->blob_heap = NULL;
+	prebuilt->old_vers_heap = NULL;
+
+	prebuilt->select_lock_type = LOCK_NONE;
+	prebuilt->stored_select_lock_type = 99999999;
+
+	/* The cursors */
+	prebuilt->pcur = btr_pcur_create_for_mysql();
+	prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+
+	/* Make sure that search_tuple is long enough for clustered index */
+	n_search_fields = 2 * dict_table_get_n_cols(table);
+
+	prebuilt->search_tuple = dtuple_create(heap, n_search_fields);
+
+	clust_index = dict_table_get_first_index(table);
+	ut_a(n_search_fields >= clust_index->n_fields);
+
+	/* A tuple of dict_index_get_n_unique() fields, with the field
+	types copied from the clustered index */
+	n_unique = dict_index_get_n_unique(clust_index);
+
+	clust_ref = dtuple_create(heap, n_unique);
+	dict_index_copy_types(clust_ref, clust_index, n_unique);
+
+	prebuilt->clust_ref = clust_ref;
+
+	return(prebuilt);
+}
+
+/************************************************************************
+Free a prebuilt struct for a MySQL table handle, checking its magic numbers. */
+
+void
+row_prebuilt_free(
+/*==============*/
+	row_prebuilt_t*	prebuilt)	/* in, own: prebuilt struct */
+{
+	byte*	row_buf;
+	ulint	slot;
+
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED
+	    || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED) {
+		fprintf(stderr,
+"InnoDB: Error: trying to free a corrupt\n"
+"InnoDB: table handle. Magic n %lu, magic n2 %lu, table name",
+		(ulong) prebuilt->magic_n,
+		(ulong) prebuilt->magic_n2);
+		ut_print_name(stderr, NULL, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption((byte*)prebuilt);
+
+		ut_error;
+	}
+
+	prebuilt->magic_n = ROW_PREBUILT_FREED;
+	prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+	btr_pcur_free_for_mysql(prebuilt->pcur);
+	btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+	/* The rest is freed only if it has been created */
+
+	if (prebuilt->mysql_template != NULL) {
+		mem_free(prebuilt->mysql_template);
+	}
+	if (prebuilt->ins_graph != NULL) {
+		que_graph_free_recursive(prebuilt->ins_graph);
+	}
+	if (prebuilt->sel_graph != NULL) {
+		que_graph_free_recursive(prebuilt->sel_graph);
+	}
+	if (prebuilt->upd_graph != NULL) {
+		que_graph_free_recursive(prebuilt->upd_graph);
+	}
+	if (prebuilt->blob_heap != NULL) {
+		mem_heap_free(prebuilt->blob_heap);
+	}
+	if (prebuilt->old_vers_heap != NULL) {
+		mem_heap_free(prebuilt->old_vers_heap);
+	}
+
+	for (slot = 0; slot < MYSQL_FETCH_CACHE_SIZE; slot++) {
+		row_buf = prebuilt->fetch_cache[slot];
+
+		if (row_buf == NULL) {
+			continue;
+		}
+
+		/* A magic number is expected both in the 4 bytes before the
+		row and right after its mysql_row_len bytes */
+
+		if (ROW_PREBUILT_FETCH_MAGIC_N
+				!= mach_read_from_4(row_buf - 4)
+		    || ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+				row_buf + prebuilt->mysql_row_len)) {
+			fputs(
+			"InnoDB: Error: trying to free a corrupt\n"
+			"InnoDB: fetch buffer.\n", stderr);
+			mem_analyze_corruption(row_buf);
+			ut_error;
+		}
+
+		mem_free(row_buf - 4);
+	}
+
+	dict_table_decrement_handle_count(prebuilt->table);
+
+	mem_heap_free(prebuilt->heap);
+}
+
+/*************************************************************************
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct, and in the prebuilt struct itself. */
+
+void
+row_update_prebuilt_trx(
+/*====================*/
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct in MySQL
+					handle */
+	trx_t*		trx)		/* in: transaction handle */
+{
+	if (trx->magic_n != TRX_MAGIC_N) {
+		fprintf(stderr,
+		"InnoDB: Error: trying to use a corrupt\n"
+		"InnoDB: trx handle. Magic n %lu\n",
+		(ulong) trx->magic_n);
+
+		mem_analyze_corruption((byte*)trx);
+
+		ut_error;
+	}
+
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+		fprintf(stderr,
+		"InnoDB: Error: trying to use a corrupt\n"
+		"InnoDB: table handle. Magic n %lu, table name",
+		(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, NULL, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption((byte*)prebuilt);
+
+		ut_error;
+	}
+
+	prebuilt->trx = trx;
+
+	/* Only the graphs that have been built need updating */
+	if (prebuilt->sel_graph != NULL) {
+		prebuilt->sel_graph->trx = trx;
+	}
+
+	if (prebuilt->ins_graph != NULL) {
+		prebuilt->ins_graph->trx = trx;
+	}
+
+	if (prebuilt->upd_graph != NULL) {
+		prebuilt->upd_graph->trx = trx;
+	}
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it. */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+					/* out: prebuilt dtuple; the column
+					type information is also set in it */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	dict_table_t*	table	= prebuilt->table;
+	ins_node_t*	ins_node;
+	dtuple_t*	ins_row;
+	ulint		n_fields;
+	ulint		field_no;
+
+	ut_ad(prebuilt && table && prebuilt->trx);
+
+	if (prebuilt->ins_node != NULL) {
+		/* The insert node and graph exist already */
+		return(prebuilt->ins_node->row);
+	}
+
+	/* Not called before for this handle: create an insert node
+	and query graph to the prebuilt struct */
+
+	ins_node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+	prebuilt->ins_node = ins_node;
+
+	if (prebuilt->ins_upd_rec_buff == NULL) {
+		prebuilt->ins_upd_rec_buff = mem_heap_alloc(prebuilt->heap,
+						prebuilt->mysql_row_len);
+	}
+
+	ins_row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table));
+
+	dict_table_copy_types(ins_row, table);
+
+	/* We init the value of every field to the SQL NULL to avoid
+	a debug assertion from failing */
+
+	n_fields = dtuple_get_n_fields(ins_row);
+
+	for (field_no = 0; field_no < n_fields; field_no++) {
+		dtuple_get_nth_field(ins_row, field_no)->len = UNIV_SQL_NULL;
+	}
+
+	ins_node_set_new_row(ins_node, ins_row);
+
+	prebuilt->ins_graph = que_node_get_parent(
+		pars_complete_graph_for_exec(ins_node, prebuilt->trx,
+						prebuilt->heap));
+	prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+
+	return(prebuilt->ins_node->row);
+}
+
+/*************************************************************************
+Bumps the modification counter of the table by one and, when the counter
+value read before the bump exceeds the threshold, asks the dictionary
+module to recalculate the table and index statistics. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+	dict_table_t*	table)	/* in: table */
+{
+	ulint	n_modified	= table->stat_modified_counter;
+
+	table->stat_modified_counter = n_modified + 1;
+
+	/* New statistics are calculated if 1 / 16 of the table has been
+	modified since the last time a statistics batch was run, or if
+	the counter is above 2 000 000 000 (to avoid wrap-around). The
+	constant 16 in the threshold keeps a very small, very frequently
+	updated counter table from triggering a recalculation on every
+	round. */
+
+	if (n_modified <= 2000000000
+	    && (ib_longlong)n_modified <= 16 + table->stat_n_rows / 16) {
+
+		return;
+	}
+
+	dict_update_statistics(table);
+}
+		  	
+/*************************************************************************
+Releases the AUTO_INC type lock of trx, if trx->auto_inc_lock shows that
+one is reserved; otherwise does nothing. */
+
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	if (trx->auto_inc_lock) {
+
+		lock_table_unlock_auto_inc(trx);
+	}
+}
+
+/*************************************************************************
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table. If trx already holds an AUTO_INC lock, DB_SUCCESS is returned at
+once. The lock request is repeated for as long as the error handler
+reports that the failure was a lock wait. */
+
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+					/* out: error code or DB_SUCCESS */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in the MySQL
+					table handle */
+{
+	trx_t*		trx	= prebuilt->trx;
+	ins_node_t*	node;
+	que_thr_t*	thr;
+	ulint		err;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	if (trx->auto_inc_lock) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx->op_info = "setting auto-inc lock";
+
+	if (prebuilt->ins_node == NULL) {
+		/* Builds the insert node and the insert graph */
+		row_get_prebuilt_insert_row(prebuilt);
+	}
+
+	node = prebuilt->ins_node;
+
+	/* The insert query graph serves as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	for (;;) {
+		thr->run_node = node;
+		thr->prev_node = node;
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		trx_start_if_not_started(trx);
+
+		err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+		trx->error_state = err;
+
+		if (err == DB_SUCCESS) {
+			que_thr_stop_for_mysql_no_error(thr, trx);
+
+			break;
+		}
+
+		que_thr_stop_for_mysql(thr);
+
+		if (!row_mysql_handle_errors(&err, trx, thr, NULL)) {
+			/* Not a lock wait: give up with the error */
+
+			break;
+		}
+	}
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Releases all table locks explicitly requested by trx (with LOCK TABLES,
+lock type LOCK_TABLE_EXP). Does nothing if trx->n_lock_table_exp is
+zero. The release is done while holding the kernel mutex. */
+
+void
+row_unlock_tables_for_mysql(
+/*========================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	if (trx->n_lock_table_exp) {
+
+		mutex_enter(&kernel_mutex);
+		lock_release_tables_off_kernel(trx);
+		mutex_exit(&kernel_mutex);
+	}
+}
+
+/*************************************************************************
+Sets a table lock on the given table, or on the table mentioned in
+prebuilt if no table is given. The select query graph of prebuilt is
+built if it does not exist yet. The lock request is repeated for as long
+as the error handler reports that the failure was a lock wait. */
+
+int
+row_lock_table_for_mysql(
+/*=====================*/
+					/* out: error code or DB_SUCCESS */
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct in the MySQL
+					table handle */
+	dict_table_t*	table,		/* in: table to lock in the given
+					mode, or NULL if prebuilt->table
+					should be locked in the mode
+					prebuilt->select_lock_type */
+	ulint		mode)		/* in: lock mode of table; if table
+					is NULL, the value
+					LOCK_TABLE_TRANSACTIONAL selects that
+					flag for the lock call and any other
+					value selects LOCK_TABLE_EXP */
+{
+	trx_t*		trx	= prebuilt->trx;
+	que_thr_t*	thr;
+	ulint		err;
+	ulint		flags;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "setting table lock";
+
+	if (prebuilt->sel_graph == NULL) {
+		/* Build a dummy select query graph */
+		row_prebuild_sel_graph(prebuilt);
+	}
+
+	/* The select query graph serves as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	for (;;) {
+		thr->run_node = thr;
+		thr->prev_node = thr->common.parent;
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		trx_start_if_not_started(trx);
+
+		if (table != NULL) {
+			err = lock_table(0, table, mode, thr);
+		} else {
+			flags = (mode == LOCK_TABLE_TRANSACTIONAL)
+				? LOCK_TABLE_TRANSACTIONAL
+				: LOCK_TABLE_EXP;
+
+			err = lock_table(flags, prebuilt->table,
+					prebuilt->select_lock_type, thr);
+		}
+
+		trx->error_state = err;
+
+		if (err == DB_SUCCESS) {
+			que_thr_stop_for_mysql_no_error(thr, trx);
+
+			break;
+		}
+
+		que_thr_stop_for_mysql(thr);
+
+		if (!row_mysql_handle_errors(&err, trx, thr, NULL)) {
+			/* Not a lock wait: give up with the error */
+
+			break;
+		}
+	}
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+					
+/*************************************************************************
+Does an insert for MySQL. The row given in the MySQL format is converted
+to the prebuilt insert tuple and inserted by running the insert node of
+the prebuilt insert graph. The insert is refused with DB_ERROR if the
+.ibd file of the table is missing, or if a new raw disk partition was
+initialized or innodb_force_recovery is set. On success the row count
+estimate of the table and the global insert counter are incremented and
+the table statistics are refreshed if needed. */
+
+int
+row_insert_for_mysql(
+/*=================*/
+					/* out: error code or DB_SUCCESS */
+	byte*		mysql_rec,	/* in: row in the MySQL format */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	trx_savept_t	savept;
+	que_thr_t*	thr;
+	ulint		err;
+	ibool		was_lock_wait;
+	trx_t*		trx		= prebuilt->trx;
+	ins_node_t*	node		= prebuilt->ins_node;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	if (prebuilt->table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+				prebuilt->table->name);
+		return(DB_ERROR);
+	}
+
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+		/* The handle is being used here, not freed: the message
+		says so, in line with row_update_prebuilt_trx() */
+
+		fprintf(stderr,
+		"InnoDB: Error: trying to use a corrupt\n"
+		"InnoDB: table handle. Magic n %lu, table name",
+		(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, prebuilt->trx, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption((byte*)prebuilt);
+
+		ut_error;
+	}
+
+	if (srv_created_new_raw || srv_force_recovery) {
+		fputs(
+		"InnoDB: A new raw disk partition was initialized or\n"
+		"InnoDB: innodb_force_recovery is on: we do not allow\n"
+		"InnoDB: database modifications by the user. Shut down\n"
+		"InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+		"InnoDB: with raw, and innodb_force_... is removed.\n",
+			stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "inserting";
+
+	row_mysql_delay_if_needed();
+
+	trx_start_if_not_started(trx);
+
+	if (node == NULL) {
+		/* First insert through this handle: build the insert node
+		and the insert graph */
+		row_get_prebuilt_insert_row(prebuilt);
+		node = prebuilt->ins_node;
+	}
+
+	row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
+
+	/* The savepoint is handed to the error handler below */
+
+	savept = trx_savept_take(trx);
+
+	thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+	/* The insert node starts from the IX lock step only for the first
+	operation of an SQL statement */
+
+	if (prebuilt->sql_stat_start) {
+		node->state = INS_NODE_SET_IX_LOCK;
+		prebuilt->sql_stat_start = FALSE;
+	} else {
+		node->state = INS_NODE_ALLOC_ROW_ID;
+	}
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+
+	row_ins_step(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		/* TODO: the purpose of marking the thread with
+		QUE_THR_LOCK_ROW for the duration of the error handling
+		is not documented here */
+
+		thr->lock_state = QUE_THR_LOCK_ROW;
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+								&savept);
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	prebuilt->table->stat_n_rows++;
+
+	srv_n_rows_inserted++;
+
+	if (prebuilt->table->stat_n_rows == 0) {
+		/* Avoid wrap-over */
+		prebuilt->table->stat_n_rows--;
+	}
+
+	row_update_statistics_if_needed(prebuilt->table);
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Builds the dummy query graph used in selects and stores it in
+prebuilt->sel_graph. Does nothing if the graph already exists. */
+
+void
+row_prebuild_sel_graph(
+/*===================*/
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	sel_node_t*	sel;
+
+	ut_ad(prebuilt && prebuilt->trx);
+
+	if (prebuilt->sel_graph != NULL) {
+
+		return;
+	}
+
+	sel = sel_node_create(prebuilt->heap);
+
+	prebuilt->sel_graph = que_node_get_parent(
+		pars_complete_graph_for_exec(sel, prebuilt->trx,
+						prebuilt->heap));
+
+	prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+}
+
+/*************************************************************************
+Creates a query graph node of 'update' type to be used in the MySQL
+interface. The node gets a persistent cursor of its own and an update
+vector with room for every column of the table. */
+
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+				/* out, own: update node */
+	dict_table_t*	table,	/* in: table to update */
+	mem_heap_t*	heap)	/* in: mem heap from which allocated */
+{
+	upd_node_t*	upd_node	= upd_node_create(heap);
+	ulint		n_cols;
+
+	/* Mode flags: a plain, non-searched update driven from MySQL */
+
+	upd_node->in_mysql_interface = TRUE;
+	upd_node->is_delete = FALSE;
+	upd_node->searched_update = FALSE;
+	upd_node->select_will_do_update = FALSE;
+	upd_node->select = NULL;
+
+	upd_node->pcur = btr_pcur_create_for_mysql();
+	upd_node->table = table;
+
+	/* The update vector is sized for all columns of the table */
+
+	n_cols = dict_table_get_n_cols(table);
+
+	upd_node->update = upd_create(n_cols, heap);
+	upd_node->update_n_fields = n_cols;
+
+	UT_LIST_INIT(upd_node->columns);
+	upd_node->has_clust_rec_x_lock = TRUE;
+	upd_node->cmpl_info = 0;
+
+	upd_node->table_sym = NULL;
+	upd_node->col_assign_list = NULL;
+
+	return(upd_node);
+}
+
+/*************************************************************************
+Returns the prebuilt update vector used for updates through this handle.
+On the first call for a handle the update node and the update query graph
+are created in prebuilt->heap; later calls simply return the vector of
+the existing update node. */
+
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+					/* out: prebuilt update vector */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	dict_table_t*	table	= prebuilt->table;
+	upd_node_t*	upd;
+
+	ut_ad(prebuilt && table && prebuilt->trx);
+
+	if (prebuilt->upd_node != NULL) {
+		/* The update node was built by an earlier call */
+
+		return(prebuilt->upd_node->update);
+	}
+
+	/* First call for this handle: create an update node and a query
+	graph in the prebuilt struct */
+
+	upd = row_create_update_node_for_mysql(table, prebuilt->heap);
+
+	prebuilt->upd_node = upd;
+
+	prebuilt->upd_graph = que_node_get_parent(
+		pars_complete_graph_for_exec(upd, prebuilt->trx,
+						prebuilt->heap));
+	prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+
+	return(prebuilt->upd_node->update);
+}
+
+/*************************************************************************
+Does an update or delete of a row for MySQL. The row to modify is the one
+on which the stored cursor position of the prebuilt struct stands; the
+prebuilt update node (prebuilt->upd_node) and update graph are used as
+they are and must already exist. The operation is refused with DB_ERROR
+if the .ibd file of the table is missing, or if a new raw disk partition
+was initialized or innodb_force_recovery is set. On success the row count
+estimate and the global delete/update counters are maintained and the
+table statistics are refreshed if needed. */
+
+int
+row_update_for_mysql(
+/*=================*/
+					/* out: error code or DB_SUCCESS;
+					DB_RECORD_NOT_FOUND is returned as
+					such, with trx->error_state reset
+					to DB_SUCCESS */
+	byte*		mysql_rec,	/* in: the row to be updated, in
+					the MySQL format; not used */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	trx_savept_t	savept;
+	ulint		err;
+	que_thr_t*	thr;
+	ibool		was_lock_wait;
+	dict_index_t*	clust_index;
+	upd_node_t*	node;
+	dict_table_t*	table		= prebuilt->table;
+	trx_t*		trx		= prebuilt->trx;
+
+	ut_ad(prebuilt && trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	UT_NOT_USED(mysql_rec);
+
+	if (prebuilt->table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+				prebuilt->table->name);
+		return(DB_ERROR);
+	}
+
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+		/* The handle is being used here, not freed: the message
+		says so, in line with row_update_prebuilt_trx() */
+
+		fprintf(stderr,
+		"InnoDB: Error: trying to use a corrupt\n"
+		"InnoDB: table handle. Magic n %lu, table name",
+		(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, prebuilt->trx, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption((byte*)prebuilt);
+
+		ut_error;
+	}
+
+	if (srv_created_new_raw || srv_force_recovery) {
+		fputs(
+		"InnoDB: A new raw disk partition was initialized or\n"
+		"InnoDB: innodb_force_recovery is on: we do not allow\n"
+		"InnoDB: database modifications by the user. Shut down\n"
+		"InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+		"InnoDB: with raw, and innodb_force_... is removed.\n",
+			stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "updating or deleting";
+
+	row_mysql_delay_if_needed();
+
+	trx_start_if_not_started(trx);
+
+	node = prebuilt->upd_node;
+
+	clust_index = dict_table_get_first_index(table);
+
+	/* Take the position of the clustered index record either directly
+	from prebuilt->pcur, if that cursor is on the clustered index, or
+	from prebuilt->clust_pcur otherwise */
+
+	if (prebuilt->pcur->btr_cur.index == clust_index) {
+		btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur);
+	} else {
+		btr_pcur_copy_stored_position(node->pcur,
+							prebuilt->clust_pcur);
+	}
+
+	ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+	/* MySQL seems to call rnd_pos before updating each row it
+	has cached: we can get the correct cursor position from
+	prebuilt->pcur; NOTE that we cannot build the row reference
+	from mysql_rec if the clustered index was automatically
+	generated for the table: MySQL does not know anything about
+	the row id used as the clustered index key */
+
+	/* The savepoint is handed to the error handler below */
+
+	savept = trx_savept_take(trx);
+
+	thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	ut_ad(!prebuilt->sql_stat_start);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+
+	row_upd_step(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		if (err == DB_RECORD_NOT_FOUND) {
+			/* Reported to the caller without going through
+			the error handler */
+
+			trx->error_state = DB_SUCCESS;
+			trx->op_info = "";
+
+			return((int) err);
+		}
+
+		/* The thread is marked with QUE_THR_LOCK_ROW for the
+		duration of the error handling, as in
+		row_insert_for_mysql() */
+
+		thr->lock_state = QUE_THR_LOCK_ROW;
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+								&savept);
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	if (node->is_delete) {
+		/* The row count estimate is never taken below zero */
+
+		if (prebuilt->table->stat_n_rows > 0) {
+			prebuilt->table->stat_n_rows--;
+		}
+
+		srv_n_rows_deleted++;
+	} else {
+		srv_n_rows_updated++;
+	}
+
+	row_update_statistics_if_needed(prebuilt->table);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Does an unlock of a row for MySQL. Work is done only when
+srv_locks_unsafe_for_binlog is set and trx->trx_create_lock is TRUE: the
+cursor position of prebuilt->pcur is then restored inside a
+mini-transaction, lock_rec_reset_and_release_wait() is called on the
+record under the cursor, and trx->trx_create_lock is cleared. */
+
+int
+row_unlock_for_mysql(
+/*=================*/
+					/* out: error code or DB_SUCCESS */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	btr_pcur_t*	cur	= prebuilt->pcur;
+	trx_t*		trx	= prebuilt->trx;
+	rec_t*		rec;
+	mtr_t		mtr;
+
+	ut_ad(prebuilt && trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "unlock_row";
+
+	if (srv_locks_unsafe_for_binlog && trx->trx_create_lock == TRUE) {
+
+		mtr_start(&mtr);
+
+		/* Restore a cursor position and find a record */
+		btr_pcur_restore_position(BTR_SEARCH_LEAF, cur, &mtr);
+		rec = btr_pcur_get_rec(cur);
+
+		if (rec == NULL) {
+			fputs("InnoDB: Error: "
+			      "Record for the lock not found\n",
+			      stderr);
+			mem_analyze_corruption((byte*) trx);
+			ut_error;
+		} else {
+			lock_rec_reset_and_release_wait(rec);
+		}
+
+		trx->trx_create_lock = FALSE;
+		mtr_commit(&mtr);
+	}
+
+	trx->op_info = "";
+
+	return(DB_SUCCESS);
+}
+
+/**************************************************************************
+Does a cascaded delete or set null in a foreign key operation. The update
+step is run again after every normal lock wait; any other error is
+returned to the caller untouched. On success the row count estimate and
+the global delete/update counters are maintained and the table statistics
+are refreshed if needed. */
+
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+				/* out: error code or DB_SUCCESS */
+	que_thr_t*	thr,	/* in: query thread */
+	upd_node_t*	node,	/* in: update node used in the cascade
+				or set null operation */
+	dict_table_t*	table)	/* in: table where we do the operation */
+{
+	trx_t*	trx	= thr_get_trx(thr);
+	ulint	err;
+
+	for (;;) {
+		thr->run_node = node;
+		thr->prev_node = node;
+
+		row_upd_step(thr);
+
+		err = trx->error_state;
+
+		/* Note that the cascade node is a subnode of another
+		InnoDB query graph node. We do a normal lock wait in this
+		node, but all errors are handled by the parent node. */
+
+		if (err != DB_LOCK_WAIT) {
+
+			break;
+		}
+
+		/* Handle lock wait here */
+
+		que_thr_stop_for_mysql(thr);
+
+		srv_suspend_mysql_thread(thr);
+
+		/* Note that a lock wait may also end in a lock wait
+		timeout, or this transaction is picked as a victim in
+		selective deadlock resolution */
+
+		if (trx->error_state != DB_SUCCESS) {
+
+			return(trx->error_state);
+		}
+
+		/* Retry the operation after a normal lock wait */
+	}
+
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	if (node->is_delete) {
+		/* The row count estimate is never taken below zero */
+
+		if (table->stat_n_rows > 0) {
+			table->stat_n_rows--;
+		}
+
+		srv_n_rows_deleted++;
+	} else {
+		srv_n_rows_updated++;
+	}
+
+	row_update_statistics_if_needed(table);
+
+	return(err);
+}
+
+/*************************************************************************
+Checks if a table is such that we automatically created a clustered
+index on it (on row id). The test is whether the first field of the
+first index of the table has the main type DATA_SYS. */
+
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+				/* out: TRUE if the first field of the
+				first index is of type DATA_SYS */
+	dict_table_t*	table)	/* in: table */
+{
+	dict_index_t*	first_index	= dict_table_get_first_index(table);
+
+	return(dtype_get_mtype(dict_index_get_nth_type(first_index, 0))
+					== DATA_SYS ? TRUE : FALSE);
+}
+
+/*************************************************************************
+Calculates the key number used inside MySQL for an Innobase index: the
+position of the index in the index list of its table, less one if a
+default clustered index was generated for the table. */
+
+ulint
+row_get_mysql_key_number_for_index(
+/*===============================*/
+				/* out: key number */
+	dict_index_t*	index)	/* in: index */
+{
+	dict_index_t*	ind;
+	ulint		pos	= 0;
+
+	ut_a(index);
+
+	/* Count the indexes that precede 'index' in the list */
+
+	for (ind = dict_table_get_first_index(index->table);
+	     ind != index;
+	     ind = dict_table_get_next_index(ind)) {
+
+		pos++;
+	}
+
+	if (row_table_got_default_clust_index(index->table)) {
+		/* The generated clustered index is not counted as a key */
+
+		ut_a(i > 0);
+		pos--;
+	}
+
+	return(pos);
+}
+
+/*************************************************************************
+Recovers an orphaned tmp table inside InnoDB by renaming it. In the table
+name #sql becomes rsql, and "_recover_innodb_tmp_table" is catenated to
+the end of name. table->name should be of the form
+"dbname/rsql..._recover_innodb_tmp_table". This renames a table whose
+name is "#sql...". If table->name does not contain "/rsql", the
+transaction is committed and DB_ERROR is returned. */
+static
+int
+row_mysql_recover_tmp_table(
+/*========================*/
+				/* out: error code or DB_SUCCESS */
+	dict_table_t*	table,	/* in: table definition */
+	trx_t*		trx)	/* in: transaction handle */
+{
+	const char*	ptr	= strstr(table->name, "/rsql");
+	int		status;
+	int		namelen;
+	char*		old_name;
+
+	if (ptr == NULL) {
+		/* table name does not contain "/rsql" */
+		trx_commit_for_mysql(trx);
+
+		return(DB_ERROR);
+	}
+
+	/* Derive the old "#sql..." name from a private copy of the
+	current name */
+
+	namelen = (int) strlen(table->name);
+	old_name = mem_strdupl(table->name, namelen);
+
+	/* replace "rsql" with "#sql" */
+	old_name[ptr - table->name + 1] = '#';
+
+	/* remove "_recover_innodb_tmp_table" suffix */
+	ut_ad(namelen > (int) sizeof S_recover_innodb_tmp_table);
+	ut_ad(!strcmp(old_name + namelen + 1 -
+		sizeof S_recover_innodb_tmp_table,
+		S_recover_innodb_tmp_table));
+	old_name[namelen + 1 - sizeof S_recover_innodb_tmp_table] = 0;
+
+	status = row_rename_table_for_mysql(old_name, table->name, trx);
+
+	mem_free(old_name);
+
+	return(status);
+}
+
+/*************************************************************************
+Takes an s-lock on dict_operation_lock to protect the data dictionary
+from modifications, for performing foreign key check, rollback, or other
+operation invisible to MySQL. The latch mode is recorded in trx. */
+
+void
+row_mysql_freeze_data_dictionary(
+/*=============================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	/* The transaction must not hold the dictionary latch already */
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	rw_lock_s_lock(&dict_operation_lock);
+	trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*************************************************************************
+Releases the s-lock on dict_operation_lock taken with
+row_mysql_freeze_data_dictionary() and clears the latch mode recorded
+in trx. */
+
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	/* Only a shared latch may be released here */
+	ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+	rw_lock_s_unlock(&dict_operation_lock);
+	trx->dict_operation_lock_mode = 0;
+}
+
+/*************************************************************************
+Locks the data dictionary exclusively for performing a table create or
+other data dictionary modification operation: takes an x-lock on
+dict_operation_lock, records the latch mode in trx, and then enters the
+dictionary mutex. */
+
+void
+row_mysql_lock_data_dictionary(
+/*===========================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	ut_a(trx->dict_operation_lock_mode == 0
+	     || trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* Data dictionary operations are serialized with the dictionary
+	mutex: no deadlocks or lock waits can occur then in these
+	operations */
+
+	rw_lock_x_lock(&dict_operation_lock);
+	trx->dict_operation_lock_mode = RW_X_LATCH;
+
+	mutex_enter(&dict_sys->mutex);
+}
+
+/*************************************************************************
+Unlocks the data dictionary exclusive lock: exits the dictionary mutex,
+releases the x-lock on dict_operation_lock, and clears the latch mode
+recorded in trx. */
+
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	/* Only an exclusive latch may be released here */
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* Release in the reverse order of
+	row_mysql_lock_data_dictionary() */
+
+	mutex_exit(&dict_sys->mutex);
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	trx->dict_operation_lock_mode = 0;
+}
+
+/*************************************************************************
+Does a table creation operation for MySQL.  If the name of the table
+to be created is equal with one of the predefined magic table names,
+then this also starts printing the corresponding monitor output by
+the master thread.
+
+The function returns DB_ERROR, after committing trx, if a new raw disk
+partition was initialized or if the name is that of a MySQL system table.
+A name recognized by row_mysql_is_recovered_tmp_table() is handed over to
+row_mysql_recover_tmp_table() instead of creating anything. Otherwise a
+table creation query graph is built and run; if that fails, trx is rolled
+back and the error code is returned with trx->error_state reset to
+DB_SUCCESS. */
+
+int
+row_create_table_for_mysql(
+/*=======================*/
+				/* out: error code or DB_SUCCESS */
+	dict_table_t*	table,	/* in: table definition */
+	trx_t*		trx)	/* in: transaction handle */
+{
+	tab_node_t*	node;
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	const char*	table_name;
+	ulint		table_name_len;
+	ulint		err;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	
+	/* NOTE(review): the message below also mentions
+	innodb_force_recovery, but only srv_created_new_raw is tested
+	here */
+
+	if (srv_created_new_raw) {
+		fputs(
+		"InnoDB: A new raw disk partition was initialized or\n"
+		"InnoDB: innodb_force_recovery is on: we do not allow\n"
+		"InnoDB: database modifications by the user. Shut down\n"
+		"InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+		"InnoDB: with raw, and innodb_force_... is removed.\n",
+		stderr);
+
+		trx_commit_for_mysql(trx);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "creating table";
+	
+	if (row_mysql_is_system_table(table->name)) {
+
+		fprintf(stderr,
+    "InnoDB: Error: trying to create a MySQL system table %s of type InnoDB.\n"
+    "InnoDB: MySQL system tables must be of the MyISAM type!\n",
+		table->name);
+
+		trx_commit_for_mysql(trx);
+
+		return(DB_ERROR);
+	}
+
+	trx_start_if_not_started(trx);
+
+	if (row_mysql_is_recovered_tmp_table(table->name)) {
+
+		/* MySQL prevents accessing of tables whose name begins
+		with #sql, that is temporary tables. If mysqld crashes in
+		the middle of an ALTER TABLE, we may get an orphaned
+		#sql-table in the tablespace. We have here a special
+		mechanism to recover such tables by renaming them to
+		rsql... */
+ 				
+		return(row_mysql_recover_tmp_table(table, trx));
+	}
+
+	/* The table name is prefixed with the database name and a '/'.
+	Certain table names starting with 'innodb_' have their special
+	meaning regardless of the database name.  Thus, we need to
+	ignore the database name prefix in the comparisons. */
+	table_name = strchr(table->name, '/');
+	ut_a(table_name);
+	table_name++;
+	/* The length includes the terminating NUL, so that it can be
+	compared directly against sizeof of the name constants below */
+	table_name_len = strlen(table_name) + 1;
+
+	if (table_name_len == sizeof S_innodb_monitor
+			&& !memcmp(table_name, S_innodb_monitor,
+				sizeof S_innodb_monitor)) {
+
+		/* Table equals "innodb_monitor":
+		start monitor prints */
+ 				
+		srv_print_innodb_monitor = TRUE;
+
+		/* The lock timeout monitor thread also takes care
+		of InnoDB monitor prints */
+
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (table_name_len == sizeof S_innodb_lock_monitor
+			&& !memcmp(table_name, S_innodb_lock_monitor,
+				sizeof S_innodb_lock_monitor)) {
+
+		/* Table name equals S_innodb_lock_monitor: both the
+		monitor and the lock monitor prints are switched on */
+
+		srv_print_innodb_monitor = TRUE;
+		srv_print_innodb_lock_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (table_name_len == sizeof S_innodb_tablespace_monitor
+			&& !memcmp(table_name, S_innodb_tablespace_monitor,
+				sizeof S_innodb_tablespace_monitor)) {
+
+		/* Table name equals S_innodb_tablespace_monitor */
+
+		srv_print_innodb_tablespace_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (table_name_len == sizeof S_innodb_table_monitor
+			&& !memcmp(table_name, S_innodb_table_monitor,
+				sizeof S_innodb_table_monitor)) {
+
+		/* Table name equals S_innodb_table_monitor */
+
+		srv_print_innodb_table_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (table_name_len == sizeof S_innodb_mem_validate
+			&& !memcmp(table_name, S_innodb_mem_validate,
+				sizeof S_innodb_mem_validate)) {
+	        /* We define here a debugging feature intended for
+		developers */
+
+		fputs("Validating InnoDB memory:\n"
+		 "to use this feature you must compile InnoDB with\n"
+		 "UNIV_MEM_DEBUG defined in univ.i and the server must be\n"
+		 "quiet because allocation from a mem heap is not protected\n"
+		"by any semaphore.\n", stderr);
+#ifdef UNIV_MEM_DEBUG
+		ut_a(mem_validate());
+		fputs("Memory validated\n", stderr);
+#else /* UNIV_MEM_DEBUG */
+		fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
+			stderr);
+#endif /* UNIV_MEM_DEBUG */
+	}
+
+	/* The table is created in every case, also when its name matched
+	one of the magic names above: build a table creation graph in a
+	heap of its own and run it */
+
+	heap = mem_heap_create(512);
+
+	trx->dict_operation = TRUE;
+	
+	node = tab_create_graph_create(table, heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here */
+		
+		trx->error_state = DB_SUCCESS;
+		
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+		if (err == DB_OUT_OF_FILE_SPACE) {
+			fputs("InnoDB: Warning: cannot create table ", stderr);
+			ut_print_name(stderr, trx, table->name);
+			fputs(" because tablespace full\n", stderr);
+		     	row_drop_table_for_mysql(table->name, trx, FALSE);
+
+		} else if (err == DB_DUPLICATE_KEY) {
+	    		ut_print_timestamp(stderr);
+
+			fputs("  InnoDB: Error: table ", stderr);
+			ut_print_name(stderr, trx, table->name);
+			fputs(" already exists in InnoDB internal\n"
+     "InnoDB: data dictionary. Have you deleted the .frm file\n"
+     "InnoDB: and not used DROP TABLE? Have you used DROP DATABASE\n"
+     "InnoDB: for InnoDB tables in MySQL version <= 3.23.43?\n"
+     "InnoDB: See the Restrictions section of the InnoDB manual.\n"
+     "InnoDB: You can drop the orphaned table inside InnoDB by\n"
+     "InnoDB: creating an InnoDB table with the same name in another\n"
+     "InnoDB: database and copying the .frm file to the current database.\n"
+     "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n"
+     "InnoDB: succeed.\n"
+     "InnoDB: You can look for further help from\n"
+     "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+     "InnoDB_troubleshooting_datadict.html\n", stderr);
+		}
+		
+		/* We may also get err == DB_ERROR if the .ibd file for the
+		table already exists */
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	/* The graph is freed on both the success and the error path */
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+
+The index definition is first validated: a column may not appear twice,
+and every prefix length must be below DICT_MAX_COL_PREFIX_LEN. For a table
+whose name is recognized by row_mysql_is_recovered_tmp_table() nothing is
+created and DB_SUCCESS is returned. On any error trx is rolled back, the
+table of the index is dropped, and trx->error_state is reset to
+DB_SUCCESS. */
+
+int
+row_create_index_for_mysql(
+/*=======================*/
+					/* out: error number or DB_SUCCESS */
+	dict_index_t*	index,		/* in: index definition */
+	trx_t*		trx)		/* in: transaction handle */
+{
+	ind_node_t*	node;
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	ulint		err;
+	ulint		i, j;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "creating index";
+
+	trx_start_if_not_started(trx);
+
+	/* Check that the same column does not appear twice in the index.
+	Starting from 4.0.14, InnoDB should be able to cope with that, but
+	safer not to allow them. */
+
+	for (i = 0; i < dict_index_get_n_fields(index); i++) {
+		for (j = 0; j < i; j++) {
+			if (0 == ut_strcmp(
+			      dict_index_get_nth_field(index, j)->name,
+			      dict_index_get_nth_field(index, i)->name)) {
+
+				ut_print_timestamp(stderr);
+
+				fputs("  InnoDB: Error: column ", stderr);
+				ut_print_name(stderr, trx,
+				dict_index_get_nth_field(index, i)->name);
+				fputs(" appears twice in ", stderr);
+				dict_index_name_print(stderr, trx, index);
+				fputs("\n"
+				"InnoDB: This is not allowed in InnoDB.\n",
+					stderr);
+
+				err = DB_COL_APPEARS_TWICE_IN_INDEX;
+
+				goto error_handling;
+			}
+		}
+
+		/* Check also that prefix_len < DICT_MAX_COL_PREFIX_LEN */
+
+		if (dict_index_get_nth_field(index, i)->prefix_len
+						>= DICT_MAX_COL_PREFIX_LEN) {
+			err = DB_TOO_BIG_RECORD;
+
+			goto error_handling;
+		}
+	}
+
+	if (row_mysql_is_recovered_tmp_table(index->table_name)) {
+		/* Nothing is created for a recovered tmp table. Reset
+		op_info like every other exit of this function does, so
+		that "creating index" is not left behind in trx. */
+
+		trx->op_info = "";
+
+		return(DB_SUCCESS);
+	}
+
+	heap = mem_heap_create(512);
+
+	trx->dict_operation = TRUE;
+
+	/* Note that the space id where we store the index is inherited from
+	the table in dict_build_index_def_step() in dict0crea.c. */
+
+	node = ind_create_graph_create(index, heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+error_handling:
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here: the whole table is
+		dropped, see the TODO in the function comment */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+		row_drop_table_for_mysql(index->table_name, trx, FALSE);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. Check also that foreign key
+constraints which reference this table are ok. If anything fails, the
+transaction is rolled back and the table is dropped. */
+
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+					/* out: error code or DB_SUCCESS */
+	trx_t*		trx,		/* in: transaction */
+	const char*	sql_string,	/* in: table create statement where
+					foreign keys are declared like:
+				FOREIGN KEY (a, b) REFERENCES table2(c, d),
+					table2 can be written also with the
+					database name before it: test.table2 */
+	const char*	name)		/* in: table full name in the
+					normalized form
+					database_name/table_name */
+{
+	ulint	err;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_a(sql_string);
+	
+	trx->op_info = "adding foreign keys";
+
+	trx_start_if_not_started(trx);
+
+	if (row_mysql_is_recovered_tmp_table(name)) {
+		/* Nothing to do for such a table; do not leave a stale
+		operation description in the transaction. */
+
+		trx->op_info = "";
+
+		return(DB_SUCCESS);
+	}
+
+	trx->dict_operation = TRUE;
+
+	err = dict_create_foreign_constraints(trx, sql_string, name);
+
+	if (err == DB_SUCCESS) {
+		/* Check that also referencing constraints are ok */
+		err = dict_load_foreigns(name, trx->check_foreigns);
+	}
+
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here: roll back and drop
+		the table, clearing the error state around both steps */
+		
+		trx->error_state = DB_SUCCESS;
+
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+		row_drop_table_for_mysql(name, trx, FALSE);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	/* Reset the operation description, as row_create_index_for_mysql()
+	does when it finishes */
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Drops a table for MySQL as a background operation. In ALTER TABLE on Unix
+MySQL relies on the table handler not removing the table before all
+handles to it have been removed. Furthermore, MySQL's call to drop table
+must be non-blocking. Therefore the drop is done as a background
+operation, which is taken care of by the master thread in srv0srv.c.
+A separate background transaction is allocated for the drop, committed,
+and freed again before returning. */
+static
+int
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+				/* out: error code or DB_SUCCESS */
+	const char*	name)	/* in: table name */
+{
+	trx_t*	bg_trx	= trx_allocate_for_background();
+	ulint	drop_err;
+
+	/* The original transaction may have been dropping a table that is
+	referenced by foreign keys; foreign key checking has to be switched
+	off in the background transaction to be able to drop such a table */
+
+	bg_trx->check_foreigns = FALSE;
+
+	/* Try to drop the table in InnoDB */
+
+	drop_err = row_drop_table_for_mysql(name, bg_trx, FALSE);
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	trx_commit_for_mysql(bg_trx);
+	trx_free_for_background(bg_trx);
+
+	return((int) drop_err);
+}
+
+/*************************************************************************
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix. The drop list is
+processed from its head until it is empty or a drop fails; in the latter
+case the failed entry stays in the list. */
+
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+					/* out: how many tables dropped
+					+ remaining tables in list */
+{
+	row_mysql_drop_t*	entry;
+	dict_table_t*		tbl;
+	ulint			list_len;
+	ulint			n_dropped	= 0;
+
+	for (;;) {
+		/* Take the head of the list and the current list length
+		under the kernel mutex */
+
+		mutex_enter(&kernel_mutex);
+
+		if (!row_mysql_drop_list_inited) {
+
+			UT_LIST_INIT(row_mysql_drop_list);
+			row_mysql_drop_list_inited = TRUE;
+		}
+
+		entry = UT_LIST_GET_FIRST(row_mysql_drop_list);
+		list_len = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+		mutex_exit(&kernel_mutex);
+
+		if (entry == NULL) {
+			/* All tables dropped */
+
+			break;
+		}
+
+		mutex_enter(&(dict_sys->mutex));
+		tbl = dict_table_get_low(entry->table_name);
+		mutex_exit(&(dict_sys->mutex));
+
+		/* If for some reason the table has already been dropped
+		through some other mechanism, do not try to drop it; only
+		the list entry is removed below */
+
+		if (tbl != NULL) {
+			if (row_drop_table_for_mysql_in_background(
+					entry->table_name) != DB_SUCCESS) {
+				/* If the DROP fails for some table, we
+				return, and let the main thread retry later */
+
+				break;
+			}
+
+			n_dropped++;
+		}
+
+		/* Unlink the entry, report it, and release its memory */
+
+		mutex_enter(&kernel_mutex);
+
+		UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list,
+								entry);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+		"  InnoDB: Dropped table %s in background drop queue.\n",
+			entry->table_name);
+
+		mem_free(entry->table_name);
+		mem_free(entry);
+
+		mutex_exit(&kernel_mutex);
+	}
+
+	return(list_len + n_dropped);
+}
+
+/*************************************************************************
+Returns the number of entries in the background drop list, initializing
+the list first if that has not been done yet. NOTE: the caller must own
+the kernel mutex! */
+
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+					/* out: how many tables in list */
+{
+	ulint	n_entries;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* The list is initialized lazily by whoever touches it first */
+
+	if (!row_mysql_drop_list_inited) {
+		UT_LIST_INIT(row_mysql_drop_list);
+		row_mysql_drop_list_inited = TRUE;
+	}
+
+	n_entries = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+	return(n_entries);
+}
+
+/*************************************************************************
+Adds a table to the list of tables which the master thread drops in
+background, unless a list entry with the same table name already exists.
+We need this on Unix because in ALTER TABLE MySQL may call drop table even
+if the table has running queries on it. Also, if there are running foreign
+key checks on the table, we drop the table lazily. The list is examined
+and modified while holding the kernel mutex. */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+				/* out: TRUE if the table was not yet in the
+				drop list, and was added there */
+	dict_table_t*	table)	/* in: table */
+{
+	row_mysql_drop_t*	entry;
+	ibool			added	= TRUE;
+
+	mutex_enter(&kernel_mutex);
+
+	if (!row_mysql_drop_list_inited) {
+		UT_LIST_INIT(row_mysql_drop_list);
+		row_mysql_drop_list_inited = TRUE;
+	}
+
+	/* Look if the table already is in the drop list */
+
+	for (entry = UT_LIST_GET_FIRST(row_mysql_drop_list);
+	     entry != NULL;
+	     entry = UT_LIST_GET_NEXT(row_mysql_drop_list, entry)) {
+
+		if (0 == strcmp(entry->table_name, table->name)) {
+			/* Already in the list */
+
+			added = FALSE;
+
+			break;
+		}
+	}
+
+	if (added) {
+		/* Not found: append a new entry which owns a private copy
+		of the table name */
+
+		entry = mem_alloc(sizeof(row_mysql_drop_t));
+
+		entry->table_name = mem_strdup(table->name);
+
+		UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list,
+								entry);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	return(added);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************************
+Discards the tablespace of a table which stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set TRUE. The discard is
+refused for a table in the system tablespace 0, for a table that has foreign
+key checks running on it, and, unless trx->check_foreigns is off, for a table
+that is referenced by a foreign key of some other table. The transaction is
+committed before returning. */
+
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+				/* out: error code or DB_SUCCESS */
+	const char*	name,	/* in: table name */
+	trx_t*		trx)	/* in: transaction handle */
+{
+	dict_foreign_t*	foreign;
+	dulint		new_id;
+	dict_table_t*	table;
+	que_thr_t*	thr;
+	que_t*		graph			= NULL;
+	ibool		success;
+	ulint		err;
+	char*		buf;
+
+/* How do we prevent crashes caused by ongoing operations on the table? Old
+operations could try to access non-existent pages.
+
+1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock
+on the table before we can do DISCARD TABLESPACE. Then there are no running
+queries on the table.
+2) Purge and rollback: we assign a new table id for the table. Since purge and
+rollback look for the table based on the table id, they see the table as
+'dropped' and discard their operations.
+3) Insert buffer: we remove all entries for the tablespace in the insert
+buffer tree; as long as the tablespace mem object does not exist, ongoing
+insert buffer page merges are discarded in buf0rea.c. If we recreate the
+tablespace mem object with IMPORT TABLESPACE later, then the tablespace will
+have the same id, but the tablespace_version field in the mem object is
+different, and ongoing old insert buffer page merges get discarded.
+4) Linear readahead and random readahead: we use the same method as in 3) to
+discard ongoing operations.
+5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we
+do not allow the discard. We also reserve the data dictionary latch. */
+
+	/* The procedure text is assembled as proc1 + quoted table name +
+	proc2, where proc2 is a printf format taking the high and low
+	halves of the new table id */
+
+	static const char discard_tablespace_proc1[] =
+	"PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
+	"old_id CHAR;\n"
+	"new_id CHAR;\n"
+	"new_id_low INT;\n"
+	"new_id_high INT;\n"
+	"table_name CHAR;\n"
+	"BEGIN\n"
+	"table_name := '";
+	static const char discard_tablespace_proc2[] =
+	"';\n"
+	"new_id_high := %lu;\n"
+	"new_id_low := %lu;\n"
+   "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n"
+	"SELECT ID INTO old_id\n"
+	"FROM SYS_TABLES\n"
+	"WHERE NAME = table_name;\n"
+	"IF (SQL %% NOTFOUND) THEN\n"
+	"	COMMIT WORK;\n"
+	"	RETURN;\n"
+	"END IF;\n"
+	"UPDATE SYS_TABLES SET ID = new_id\n"
+	"WHERE ID = old_id;\n"
+	"UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n"
+	"WHERE TABLE_ID = old_id;\n"
+	"UPDATE SYS_INDEXES SET TABLE_ID = new_id\n"
+	"WHERE TABLE_ID = old_id;\n"
+	"COMMIT WORK;\n"
+	"END;\n";
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "discarding tablespace";
+	trx_start_if_not_started(trx);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	table = dict_table_get_low(name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+
+		goto funct_exit;
+	}
+
+	if (table->space == 0) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, name);
+		fputs("\n"
+"InnoDB: is in the system tablespace 0 which cannot be discarded\n", stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	if (table->n_foreign_key_checks_running > 0) {
+
+	        ut_print_timestamp(stderr);
+		fputs("	 InnoDB: You are trying to DISCARD table ", stderr);
+		ut_print_name(stderr, trx, table->name);
+		fputs("\n"
+		 "InnoDB: though there is a foreign key check running on it.\n"
+		 "InnoDB: Cannot discard the table.\n",
+			stderr);
+
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	
+	while (foreign && foreign->foreign_table == table) {
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns) {
+
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow discarding a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		err = DB_CANNOT_DROP_CONSTRAINT;
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+
+		fputs("  Cannot DISCARD table ", ef);
+		ut_print_name(ef, trx, name);
+		fputs("\n"
+			"because it is referenced by ", ef);
+		ut_print_name(ef, trx, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		goto funct_exit;
+	}
+
+	new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+	/* Room for both procedure parts, the quoted name, and the decimal
+	expansion of the two %lu conversions */
+
+	buf = mem_alloc((sizeof discard_tablespace_proc1) +
+			(sizeof discard_tablespace_proc2) +
+			20 + ut_strlenq(name, '\''));
+
+	memcpy(buf, discard_tablespace_proc1, sizeof discard_tablespace_proc1);
+	sprintf(ut_strcpyq(buf + (sizeof discard_tablespace_proc1 - 1),
+			'\'', name),
+		discard_tablespace_proc2,
+		(ulong) ut_dulint_get_high(new_id),
+		(ulong) ut_dulint_get_low(new_id));
+
+	graph = pars_sql(buf);
+
+	ut_a(graph);
+
+	/* The SQL text is only needed for parsing; release it here, the
+	same way the other pars_sql() callers in this file release their
+	SQL buffers. Previously this buffer was never freed. */
+
+	mem_free(buf);
+
+	/* Remove any locks there are on the table or its records */
+	
+	lock_reset_all_on_table(table);
+
+	graph->trx = trx;
+	trx->graph = NULL;
+
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	thr = que_fork_start_command(graph);
+	ut_a(thr);
+
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+		trx->error_state = DB_SUCCESS;
+	} else {
+		dict_table_change_id_in_cache(table, new_id);
+
+		success = fil_discard_tablespace(table->space);
+
+		if (!success) {
+			trx->error_state = DB_SUCCESS;
+			trx_general_rollback_for_mysql(trx, FALSE, NULL);
+			trx->error_state = DB_SUCCESS;
+
+			err = DB_ERROR;
+		} else {
+			/* Set the flag which tells that now it is legal to
+			IMPORT a tablespace for this table */
+			table->tablespace_discarded = TRUE;
+			table->ibd_file_missing = TRUE;
+		}
+	}
+funct_exit:	
+	row_mysql_unlock_data_dictionary(trx);
+
+	/* graph is still NULL if we bailed out before parsing */
+
+	if (graph) {
+		que_graph_free(graph);
+	}
+
+  	trx_commit_for_mysql(trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************
+Imports a tablespace for a table. The space id in the .ibd file must match
+the space id of the table in the data dictionary. The import is refused
+for a table that does not exist in the data dictionary, that is in the
+system tablespace 0, or on which DISCARD has not been called. The
+transaction is committed before returning. */
+
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+				/* out: error code or DB_SUCCESS */
+	const char*	name,	/* in: table name */
+	trx_t*		trx)	/* in: transaction handle */
+{
+	dict_table_t*	tbl;
+	ibool		ok;
+	dulint		lsn_now;
+	ulint		ret		= DB_SUCCESS;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx_start_if_not_started(trx);
+
+	trx->op_info = "importing tablespace";
+
+	lsn_now = log_get_lsn();
+
+	/* It is possible, though very improbable, that the lsn's in the
+	tablespace to be imported have risen above the current system lsn, if
+	a lengthy purge, ibuf merge, or rollback was performed on a backup
+	taken with ibbackup. If that is the case, reset page lsn's in the
+	file. We assume that mysqld was shut down after it performed these
+	cleanup operations on the .ibd file, so that it stamped the latest lsn
+	to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+
+	TODO: reset also the trx id's in clustered index records and write
+	a new space id to each data page. That would allow us to import clean
+	.ibd files from another MySQL installation. */
+
+	ok = fil_reset_too_high_lsns(name, lsn_now);
+
+	if (!ok) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: cannot reset lsn's in table ", stderr);
+		ut_print_name(stderr, trx, name);
+		fputs("\n"
+		"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", stderr);
+
+		ret = DB_ERROR;
+	}
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations. The dictionary is
+	locked also when the lsn reset failed, because the common exit
+	path below always unlocks it. */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	if (ret != DB_SUCCESS) {
+
+		goto funct_exit;
+	}
+
+	tbl = dict_table_get_low(name);
+
+	if (tbl == NULL) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: table ", stderr);
+		ut_print_name(stderr, trx, name);
+		fputs("\n"
+"InnoDB: does not exist in the InnoDB data dictionary\n"
+"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+		stderr);
+
+		ret = DB_TABLE_NOT_FOUND;
+
+		goto funct_exit;
+	}
+
+	if (0 == tbl->space) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, name);
+		fputs("\n"
+"InnoDB: is in the system tablespace 0 which cannot be imported\n", stderr);
+
+		ret = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	if (!tbl->tablespace_discarded) {
+		ut_print_timestamp(stderr);
+		fputs(
+"  InnoDB: Error: you are trying to IMPORT a tablespace\n"
+"InnoDB: ", stderr);
+		ut_print_name(stderr, trx, name);
+		fputs(", though you have not called DISCARD on it yet\n"
+"InnoDB: during the lifetime of the mysqld process!\n", stderr);
+
+		ret = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Play safe and remove all insert buffer entries, though we should
+	have removed them already when DISCARD TABLESPACE was called */
+
+	ibuf_delete_for_discarded_space(tbl->space);
+
+	ok = fil_open_single_table_tablespace(TRUE, tbl->space, tbl->name);
+
+	if (!ok) {
+		/* The message is printed only while the table is still
+		flagged as missing its .ibd file */
+
+		if (tbl->ibd_file_missing) {
+			ut_print_timestamp(stderr);
+			fputs(
+"  InnoDB: cannot find or open in the database directory the .ibd file of\n"
+"InnoDB: table ", stderr);
+			ut_print_name(stderr, trx, name);
+			fputs("\n"
+"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+			stderr);
+		}
+
+		ret = DB_ERROR;
+	} else {
+		/* The tablespace is usable again */
+
+		tbl->ibd_file_missing = FALSE;
+		tbl->tablespace_discarded = FALSE;
+	}
+
+funct_exit:
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_commit_for_mysql(trx);
+
+	trx->op_info = "";
+
+	return((int) ret);
+}
+
+/*************************************************************************
+Truncates a table for MySQL. The function scans SYS_INDEXES for the
+records of the table and calls dict_truncate_index_tree() for each index
+that is not delete-marked, then runs an internal SQL procedure which
+assigns a new table id to the table in SYS_TABLES, SYS_COLUMNS and
+SYS_INDEXES. Finally dict_table_autoinc_initialize(table, 0) and
+dict_update_statistics() are called and the transaction is committed.
+The data dictionary is locked for the duration of the operation.
+
+DB_ERROR is returned if srv_created_new_raw is set, if the table is
+referenced by a foreign key of some other table while trx->check_foreigns
+is set, if foreign key checks are running on the table, or if assigning
+the new table id fails. */
+
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+				/* out: error code or DB_SUCCESS */
+	dict_table_t*	table,	/* in: table handle */
+	trx_t*		trx)	/* in: transaction handle */
+{
+	dict_foreign_t*	foreign;
+	ulint		err;
+	mem_heap_t*	heap;
+	byte*		buf;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	dict_index_t*	sys_index;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	dulint		new_id;
+	char*		sql;
+	que_thr_t*	thr;
+	que_t*		graph			= NULL;
+
+/* How do we prevent crashes caused by ongoing operations on the table? Old
+operations could try to access non-existent pages.
+
+1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock
+on the table before we can do TRUNCATE TABLE. Then there are no running
+queries on the table. This is guaranteed, because in
+ha_innobase::store_lock(), we do not weaken the TL_WRITE lock requested
+by MySQL when executing SQLCOM_TRUNCATE.
+2) Purge and rollback: we assign a new table id for the table. Since purge and
+rollback look for the table based on the table id, they see the table as
+'dropped' and discard their operations.
+3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, so we do not
+have to remove insert buffer records, as the insert buffer works at a low
+level. If a freed page is later reallocated, the allocator will remove
+the ibuf entries for it.
+
+TODO: when we truncate *.ibd files (analogous to DISCARD TABLESPACE), we
+will have to remove all entries for the table in the insert
+buffer tree!
+
+4) Linear readahead and random readahead: we use the same method as in 3) to
+discard ongoing operations. (This will only be relevant for TRUNCATE TABLE
+by DISCARD TABLESPACE.)
+5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we
+do not allow the TRUNCATE. We also reserve the data dictionary latch. */
+
+	/* printf format of the internal procedure which changes the table
+	id; the four %lu conversions take the high and low halves of the
+	old and the new id, in that order */
+
+	static const char renumber_tablespace_proc[] =
+	"PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+	"old_id CHAR;\n"
+	"new_id CHAR;\n"
+	"old_id_low INT;\n"
+	"old_id_high INT;\n"
+	"new_id_low INT;\n"
+	"new_id_high INT;\n"
+	"BEGIN\n"
+	"old_id_high := %lu;\n"
+	"old_id_low := %lu;\n"
+	"new_id_high := %lu;\n"
+	"new_id_low := %lu;\n"
+   "old_id := CONCAT(TO_BINARY(old_id_high, 4), TO_BINARY(old_id_low, 4));\n"
+   "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n"
+	"UPDATE SYS_TABLES SET ID = new_id\n"
+	"WHERE ID = old_id;\n"
+	"UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n"
+	"WHERE TABLE_ID = old_id;\n"
+	"UPDATE SYS_INDEXES SET TABLE_ID = new_id\n"
+	"WHERE TABLE_ID = old_id;\n"
+	"COMMIT WORK;\n"
+	"END;\n";
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_ad(table);
+
+	/* Refuse the operation before anything has been started or locked */
+
+	if (srv_created_new_raw) {
+		fputs(
+		"InnoDB: A new raw disk partition was initialized or\n"
+		"InnoDB: innodb_force_recovery is on: we do not allow\n"
+		"InnoDB: database modifications by the user. Shut down\n"
+		"InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+		"InnoDB: with raw, and innodb_force_... is removed.\n",
+                stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "truncating table";
+
+	trx_start_if_not_started(trx);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+	/* Prevent foreign key checks etc. while we are truncating the
+	table */
+
+	row_mysql_lock_data_dictionary(trx);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	/* Skip the constraints where the table references itself */
+	while (foreign && foreign->foreign_table == table) {
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns) {
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow truncating a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+
+		fputs("  Cannot truncate table ", ef);
+		ut_print_name(ef, trx, table->name);
+		fputs(" by DROP+CREATE\n"
+			"InnoDB: because it is referenced by ", ef);
+		ut_print_name(ef, trx, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		err = DB_ERROR;
+		goto funct_exit;
+	}
+
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+	they can cope with the table having been truncated here? Foreign key
+	checks take an IS or IX lock on the table. */
+
+	if (table->n_foreign_key_checks_running > 0) {
+		ut_print_timestamp(stderr);
+		fputs("	 InnoDB: Cannot truncate table ", stderr);
+		ut_print_name(stderr, trx, table->name);
+		fputs(" by DROP+CREATE\n"
+"InnoDB: because there is a foreign key check running on it.\n",
+			stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Remove any locks there are on the table or its records */
+
+	lock_reset_all_on_table(table);
+
+	trx->table_id = table->id;
+
+	/* scan SYS_INDEXES for all indexes of the table */
+	heap = mem_heap_create(800);
+
+	/* Build a one-field search tuple holding the 8-byte table id */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, table->id);
+
+	dfield_set_data(dfield, buf, 8);
+	sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	/* Position the cursor on the first user record that is >= the
+	search tuple */
+	mtr_start(&mtr);
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+						BTR_MODIFY_LEAF, &pcur, &mtr);
+	for (;;) {
+		rec_t*		rec;
+		const byte*	field;
+		ulint		len;
+		ulint		root_page_no;
+
+		if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) {
+			/* The end of SYS_INDEXES has been reached. */
+			break;
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* Field 0 of the record is compared against the table id
+		written to buf above */
+		field = rec_get_nth_field_old(rec, 0, &len);
+		ut_ad(len == 8);
+
+		if (memcmp(buf, field, len) != 0) {
+			/* End of indexes for the table (TABLE_ID mismatch). */
+			break;
+		}
+
+		if (rec_get_deleted_flag(rec, FALSE)) {
+			/* The index has been dropped. */
+			goto next_rec;
+		}
+
+		/* Remember the cursor position so that it can be restored
+		after the call below, which may restart mtr */
+		btr_pcur_store_position(&pcur, &mtr);
+
+		/* This call may commit and restart mtr. */
+		root_page_no = dict_truncate_index_tree(table, rec, &mtr);
+
+		/* Re-read rec: the pointer fetched before the call is not
+		used any more after the position has been restored */
+		btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (root_page_no != FIL_NULL) {
+			/* Store the returned root page number in the
+			SYS_INDEXES record */
+			page_rec_write_index_page_no(rec,
+					DICT_SYS_INDEXES_PAGE_NO_FIELD,
+					root_page_no, &mtr);
+			/* We will need to commit and restart the
+			mini-transaction in order to avoid deadlocks.
+			The dict_truncate_index_tree() call has allocated
+			a page in this mini-transaction, and the rest of
+			this loop could latch another index page. */
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+			btr_pcur_restore_position(BTR_MODIFY_LEAF,
+							&pcur, &mtr);
+		}
+
+	next_rec:
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+	/* The search tuple is not needed any more: reuse the heap for the
+	text of the renumbering procedure */
+	mem_heap_empty(heap);
+	sql = mem_heap_alloc(heap, (sizeof renumber_tablespace_proc) + 40);
+	sprintf(sql, renumber_tablespace_proc,
+		(ulong) ut_dulint_get_high(table->id),
+		(ulong) ut_dulint_get_low(table->id),
+		(ulong) ut_dulint_get_high(new_id),
+		(ulong) ut_dulint_get_low(new_id));
+
+	graph = pars_sql(sql);
+
+	ut_a(graph);
+
+	/* This also releases the sql buffer, which was allocated from heap */
+	mem_heap_free(heap);
+
+	graph->trx = trx;
+	trx->graph = NULL;
+
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	thr = que_fork_start_command(graph);
+	ut_a(thr);
+
+	que_run_threads(thr);
+
+	que_graph_free(graph);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		/* The id could not be changed: roll back, report, and
+		return the generic DB_ERROR to the caller */
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+		trx->error_state = DB_SUCCESS;
+		ut_print_timestamp(stderr);
+fputs("	 InnoDB: Unable to assign a new identifier to table ", stderr);
+		ut_print_name(stderr, trx, table->name);
+		fputs("\n"
+"InnoDB: after truncating it.  Background processes may corrupt the table!\n",
+			stderr);
+		err = DB_ERROR;
+	} else {
+		/* Make the dictionary cache agree with the new id */
+		dict_table_change_id_in_cache(table, new_id);
+	}
+
+	/* These run whether or not the id change succeeded */
+	dict_table_autoinc_initialize(table, 0);
+	dict_update_statistics(table);
+
+  	trx_commit_for_mysql(trx);
+
+funct_exit:
+	/* The early error exits jump here, skipping the commit above */
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	srv_wake_master_thread();
+
+	return((int) err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************************
+Drops a table for MySQL. If the name of the table to be dropped is equal
+with one of the predefined magic table names, then this also stops printing
+the corresponding monitor output by the master thread. */
+
+int
+row_drop_table_for_mysql(
+/*=====================*/
+				/* out: error code or DB_SUCCESS */
+	const char*	name,	/* in: table name */
+	trx_t*		trx,	/* in: transaction handle */
+	ibool		drop_db)/* in: TRUE=dropping whole database */
+{
+	dict_foreign_t*	foreign;
+	dict_table_t*	table;
+	ulint		space_id;
+	que_thr_t*	thr;
+	que_t*		graph;
+	ulint		err;
+	const char*	table_name;
+	ulint		namelen;
+	char*		dir_path_of_temp_table	= NULL;
+	ibool		success;
+	ibool		locked_dictionary	= FALSE;
+	char*		quoted_name;
+	char*		sql;
+
+	/* We use the private SQL parser of Innobase to generate the
+	query graphs needed in deleting the dictionary data from system
+	tables in Innobase. Deleting a row from SYS_INDEXES table also
+	frees the file segments of the B-tree associated with the index. */
+
+	static const char str1[] =
+	"PROCEDURE DROP_TABLE_PROC () IS\n"
+	"table_name CHAR;\n"
+	"sys_foreign_id CHAR;\n"
+	"table_id CHAR;\n"
+	"index_id CHAR;\n"
+	"foreign_id CHAR;\n"
+	"found INT;\n"
+	"BEGIN\n"
+	"table_name := ";
+	static const char str2[] =
+	";\n"
+	"SELECT ID INTO table_id\n"
+	"FROM SYS_TABLES\n"
+	"WHERE NAME = table_name;\n"
+	"IF (SQL % NOTFOUND) THEN\n"
+	"	COMMIT WORK;\n"
+	"	RETURN;\n"
+	"END IF;\n"
+	"found := 1;\n"
+	"SELECT ID INTO sys_foreign_id\n"
+	"FROM SYS_TABLES\n"
+	"WHERE NAME = 'SYS_FOREIGN';\n"
+	"IF (SQL % NOTFOUND) THEN\n"
+	"	found := 0;\n"
+	"END IF;\n"
+	"IF (table_name = 'SYS_FOREIGN') THEN\n"
+	"	found := 0;\n"
+	"END IF;\n"
+	"IF (table_name = 'SYS_FOREIGN_COLS') THEN\n"
+	"	found := 0;\n"
+	"END IF;\n"
+	"WHILE found = 1 LOOP\n"
+	"	SELECT ID INTO foreign_id\n"
+	"	FROM SYS_FOREIGN\n"
+	"	WHERE FOR_NAME = table_name\n"
+        "             AND TO_BINARY(FOR_NAME) = TO_BINARY(table_name);\n"
+	"	IF (SQL % NOTFOUND) THEN\n"
+	"		found := 0;\n"
+	"	ELSE"
+	"		DELETE FROM SYS_FOREIGN_COLS WHERE ID = foreign_id;\n"
+	"		DELETE FROM SYS_FOREIGN WHERE ID = foreign_id;\n"
+	"	END IF;\n"
+	"END LOOP;\n"
+	"found := 1;\n"
+	"WHILE found = 1 LOOP\n"
+	"	SELECT ID INTO index_id\n"
+	"	FROM SYS_INDEXES\n"
+	"	WHERE TABLE_ID = table_id;\n"	
+	"	IF (SQL % NOTFOUND) THEN\n"
+	"		found := 0;\n"
+	"	ELSE"
+	"		DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n"
+	"		DELETE FROM SYS_INDEXES WHERE ID = index_id\n"
+	"					 AND TABLE_ID = table_id;\n"
+	"	END IF;\n"
+	"END LOOP;\n"
+	"DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n"
+	"DELETE FROM SYS_TABLES WHERE ID = table_id;\n"
+	"COMMIT WORK;\n"
+	"END;\n";
+
+	ut_a(name != NULL);
+
+	if (srv_created_new_raw) {
+		fputs(
+		"InnoDB: A new raw disk partition was initialized or\n"
+		"InnoDB: innodb_force_recovery is on: we do not allow\n"
+		"InnoDB: database modifications by the user. Shut down\n"
+		"InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+		"InnoDB: with raw, and innodb_force_... is removed.\n",
+                stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "dropping table";
+
+	trx_start_if_not_started(trx);
+
+	/* The table name is prefixed with the database name and a '/'.
+	Certain table names starting with 'innodb_' have their special
+	meaning regardless of the database name.  Thus, we need to
+	ignore the database name prefix in the comparisons. */
+	table_name = strchr(name, '/');
+	ut_a(table_name);
+	table_name++;
+	namelen = strlen(table_name) + 1;
+
+	if (namelen == sizeof S_innodb_monitor
+			&& !memcmp(table_name, S_innodb_monitor,
+				sizeof S_innodb_monitor)) {
+
+		/* Table name equals "innodb_monitor":
+		stop monitor prints */
+ 				
+		srv_print_innodb_monitor = FALSE;
+		srv_print_innodb_lock_monitor = FALSE;
+	} else if (namelen == sizeof S_innodb_lock_monitor
+			&& !memcmp(table_name, S_innodb_lock_monitor,
+				sizeof S_innodb_lock_monitor)) {
+		srv_print_innodb_monitor = FALSE;
+		srv_print_innodb_lock_monitor = FALSE;
+	} else if (namelen == sizeof S_innodb_tablespace_monitor
+			&& !memcmp(table_name, S_innodb_tablespace_monitor,
+				sizeof S_innodb_tablespace_monitor)) {
+
+		srv_print_innodb_tablespace_monitor = FALSE;
+	} else if (namelen == sizeof S_innodb_table_monitor
+			&& !memcmp(table_name, S_innodb_table_monitor,
+				sizeof S_innodb_table_monitor)) {
+
+		srv_print_innodb_table_monitor = FALSE;
+	}
+
+	quoted_name = mem_strdupq(name, '\'');
+	namelen = strlen(quoted_name);
+	sql = mem_alloc((sizeof str1) + (sizeof str2) - 2 + 1 + namelen);
+	memcpy(sql, str1, (sizeof str1) - 1);
+	memcpy(sql + (sizeof str1) - 1, quoted_name, namelen);
+	memcpy(sql + (sizeof str1) - 1 + namelen, str2, sizeof str2);
+	mem_free(quoted_name);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+		/* Prevent foreign key checks etc. while we are dropping the
+		table */
+
+		row_mysql_lock_data_dictionary(trx);
+
+		locked_dictionary = TRUE;
+	}
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	
+	graph = pars_sql(sql);
+
+	ut_a(graph);
+	mem_free(sql);
+
+	graph->trx = trx;
+	trx->graph = NULL;
+
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	table = dict_table_get_low(name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+	    	ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, name);
+		fputs(" does not exist in the InnoDB internal\n"
+     	"InnoDB: data dictionary though MySQL is trying to drop it.\n"
+     	"InnoDB: Have you copied the .frm file of the table to the\n"
+	"InnoDB: MySQL database directory from another database?\n"
+	"InnoDB: You can look for further help from\n"
+	"InnoDB: http://dev.mysql.com/doc/mysql/en/"
+	"InnoDB_troubleshooting_datadict.html\n", stderr);
+		goto funct_exit;
+	}
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	
+	while (foreign && foreign->foreign_table == table) {
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns &&
+		!(drop_db && dict_tables_have_same_db(
+			name, foreign->foreign_table_name))) {
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow dropping a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		err = DB_CANNOT_DROP_CONSTRAINT;
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+
+		fputs("  Cannot drop table ", ef);
+		ut_print_name(ef, trx, name);
+		fputs("\n"
+			"because it is referenced by ", ef);
+		ut_print_name(ef, trx, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		goto funct_exit;
+	}
+
+	if (table->n_mysql_handles_opened > 0) {
+		ibool	added;
+
+		added = row_add_table_to_background_drop_list(table);
+
+	        if (added) {
+			ut_print_timestamp(stderr);
+fputs("	 InnoDB: Warning: MySQL is trying to drop table ", stderr);
+			ut_print_name(stderr, trx, table->name);
+			fputs("\n"
+"InnoDB: though there are still open handles to it.\n"
+"InnoDB: Adding the table to the background drop queue.\n",
+			stderr);
+			
+			/* We return DB_SUCCESS to MySQL though the drop will
+			happen lazily later */
+
+			err = DB_SUCCESS;
+		} else {
+			/* The table is already in the background drop list */
+			err = DB_ERROR;
+		}
+
+		goto funct_exit;
+	}
+
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+	they can cope with the table having been dropped here? Foreign key
+	checks take an IS or IX lock on the table. */
+
+	if (table->n_foreign_key_checks_running > 0) {
+
+		ibool	added;
+
+		added = row_add_table_to_background_drop_list(table);
+
+		if (added) {
+	        	ut_print_timestamp(stderr);
+fputs("	 InnoDB: You are trying to drop table ", stderr);
+			ut_print_name(stderr, trx, table->name);
+			fputs("\n"
+"InnoDB: though there is a foreign key check running on it.\n"
+"InnoDB: Adding the table to the background drop queue.\n",
+			stderr);
+
+			/* We return DB_SUCCESS to MySQL though the drop will
+			happen lazily later */
+
+			err = DB_SUCCESS;
+		} else {
+			/* The table is already in the background drop list */
+			err = DB_ERROR;
+		}
+
+		goto funct_exit;
+	}
+	
+	/* Remove any locks there are on the table or its records */
+	
+	lock_reset_all_on_table(table);
+
+	trx->dict_operation = TRUE;
+	trx->table_id = table->id;
+
+	ut_a(thr = que_fork_start_command(graph));
+
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+		err = DB_MUST_GET_MORE_FILE_SPACE;
+		
+		row_mysql_handle_errors(&err, trx, thr, NULL);
+
+		ut_error;
+	} else {
+		ibool		is_path;
+		const char*	name_or_path;
+
+		space_id = table->space;
+		
+		if (table->dir_path_of_temp_table != NULL) {
+			dir_path_of_temp_table =
+				mem_strdup(table->dir_path_of_temp_table);
+			is_path = TRUE;
+			name_or_path = dir_path_of_temp_table;
+		} else {
+			is_path = FALSE;
+			name_or_path = name;
+		}
+
+		dict_table_remove_from_cache(table);
+
+		if (dict_load_table(name) != NULL) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: not able to remove table ",
+				stderr);
+			ut_print_name(stderr, trx, name);
+			fputs(" from the dictionary cache!\n", stderr);
+			err = DB_ERROR;
+		}
+
+		/* Do not drop possible .ibd tablespace if something went
+		wrong: we do not want to delete valuable data of the user */
+
+		if (err == DB_SUCCESS && space_id > 0) {
+			if (!fil_space_for_table_exists_in_mem(space_id,
+								name_or_path,
+								is_path,
+								FALSE, TRUE)) {
+				err = DB_SUCCESS;
+
+				fprintf(stderr,
+"InnoDB: We removed now the InnoDB internal data dictionary entry\n"
+"InnoDB: of table ");	
+				ut_print_name(stderr, trx, name);
+				fprintf(stderr, ".\n");
+
+				goto funct_exit;
+			}
+
+			success = fil_delete_tablespace(space_id);
+
+			if (!success) {
+				fprintf(stderr,
+"InnoDB: We removed now the InnoDB internal data dictionary entry\n"
+"InnoDB: of table ");	
+				ut_print_name(stderr, trx, name);
+				fprintf(stderr, ".\n");
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+"  InnoDB: Error: not able to delete tablespace %lu of table ",
+					(ulong) space_id);
+				ut_print_name(stderr, trx, name);
+				fputs("!\n", stderr);
+				err = DB_ERROR;
+			}
+		}
+	}
+funct_exit:
+
+	if (locked_dictionary) {
+		row_mysql_unlock_data_dictionary(trx);	
+	}
+
+	if (dir_path_of_temp_table) {
+		mem_free(dir_path_of_temp_table);
+	}
+
+	que_graph_free(graph);
+	
+  	trx_commit_for_mysql(trx);
+
+	trx->op_info = "";
+
+#ifndef UNIV_HOTBACKUP
+	srv_wake_master_thread();
+#endif /* !UNIV_HOTBACKUP */
+
+	return((int) err);
+}
+
+/*************************************************************************
+Drops a database for MySQL: drops, one by one, every table whose name
+begins with the given database prefix, then commits the transaction. */
+
+int
+row_drop_database_for_mysql(
+/*========================*/
+				/* out: error code or DB_SUCCESS */
+	const char*	name,	/* in: database name which ends in '/' */
+	trx_t*		trx)	/* in: transaction handle */
+{
+	dict_table_t*	table;
+	char*		table_name;
+	int		err	= DB_SUCCESS;
+	ulint		namelen;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_a(name != NULL);
+	/* Take the length only after the NULL check, and reject an empty
+	name before reading its last character */
+	namelen = strlen(name);
+	ut_a(namelen > 0 && name[namelen - 1] == '/');
+
+	trx->op_info = "dropping database";
+
+	trx_start_if_not_started(trx);
+loop:
+	row_mysql_lock_data_dictionary(trx);
+
+	while ((table_name = dict_get_first_table_name_in_db(name))) {
+		ut_a(memcmp(table_name, name, namelen) == 0);
+
+		table = dict_table_get_low(table_name);
+		ut_a(table);
+
+		/* Wait until MySQL does not have any queries running on
+		the table: release the dictionary, sleep, and start over */
+
+		if (table->n_mysql_handles_opened > 0) {
+			row_mysql_unlock_data_dictionary(trx);
+
+			ut_print_timestamp(stderr);
+			fputs(
+	"  InnoDB: Warning: MySQL is trying to drop database ", stderr);
+			ut_print_name(stderr, trx, name);
+			fputs("\n"
+	"InnoDB: though there are still open handles to table ", stderr);
+			ut_print_name(stderr, trx, table_name);
+			fputs(".\n", stderr);
+
+			os_thread_sleep(1000000);
+			mem_free(table_name);
+			goto loop;
+		}
+
+		err = row_drop_table_for_mysql(table_name, trx, TRUE);
+
+		if (err != DB_SUCCESS) {
+			fputs("InnoDB: DROP DATABASE ", stderr);
+			ut_print_name(stderr, trx, name);
+			fprintf(stderr, " failed with error %lu for table ",
+				(ulong) err);
+			ut_print_name(stderr, trx, table_name);
+			putc('\n', stderr);
+			/* Free the name only after it has been printed */
+			mem_free(table_name);
+			break;
+		}
+		mem_free(table_name);
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+	trx_commit_for_mysql(trx);
+
+	trx->op_info = "";
+	return(err);
+}
+
+/*************************************************************************
+Tells whether a table name looks like the name of a MySQL temporary table,
+that is, whether the substring "/#sql" occurs anywhere in it. */
+static
+ibool
+row_is_mysql_tmp_table_name(
+/*========================*/
+				/* out: TRUE if the name contains "/#sql" */
+	const char*	name)	/* in: name, 'database/tablename' */
+{
+	const char*	marker	= strstr(name, "/#sql");
+	return(marker != NULL);
+}
+
+/*************************************************************************
+Renames a table for MySQL in the InnoDB data dictionary and its cache. */
+
+int
+row_rename_table_for_mysql(
+/*=======================*/
+					/* out: error code or DB_SUCCESS */
+	const char*	old_name,	/* in: old table name */
+	const char*	new_name,	/* in: new table name */
+	trx_t*		trx)		/* in: transaction handle */
+{
+	dict_table_t*	table;
+	que_thr_t*	thr;
+	que_t*		graph			= NULL;
+	ulint		err;
+	/* We use the private SQL parser of Innobase to generate the
+	query graph that renames the table in SYS_TABLES and then either
+	deletes the foreign key constraints that are to be dropped or
+	renames the constraints in SYS_FOREIGN and SYS_FOREIGN_COLS. */
+	static const char str1[] =
+	"PROCEDURE RENAME_TABLE_PROC () IS\n"
+	"new_table_name CHAR;\n"
+	"old_table_name CHAR;\n"
+	"gen_constr_prefix CHAR;\n"
+	"new_db_name CHAR;\n"
+	"foreign_id CHAR;\n"
+	"new_foreign_id CHAR;\n"
+	"old_db_name_len INT;\n"
+	"old_t_name_len INT;\n"
+	"new_db_name_len INT;\n"
+	"id_len INT;\n"
+	"found INT;\n"
+	"BEGIN\n"
+	"new_table_name := '";
+	static const char str2[] =
+	"';\nold_table_name := '";
+	static const char str3[] =
+	"';\n"
+	"UPDATE SYS_TABLES SET NAME = new_table_name\n"
+	"WHERE NAME = old_table_name;\n";
+	static const char str4a1[] = /* drop some constraints of tmp tables */
+	"DELETE FROM SYS_FOREIGN_COLS WHERE ID = '";
+	static const char str4a2[] = "';\n"
+	"DELETE FROM SYS_FOREIGN WHERE ID = '";
+	static const char str4a3[] = "';\n";
+	static const char str4b[] = /* rename all constraints */
+	"found := 1;\n"
+	"old_db_name_len := INSTR(old_table_name, '/') - 1;\n"
+	"new_db_name_len := INSTR(new_table_name, '/') - 1;\n"
+	"new_db_name := SUBSTR(new_table_name, 0, new_db_name_len);\n"
+	"old_t_name_len := LENGTH(old_table_name);\n"
+	"gen_constr_prefix := CONCAT(old_table_name, '_ibfk_');\n"
+	"WHILE found = 1 LOOP\n"
+	"	SELECT ID INTO foreign_id\n"
+	"	FROM SYS_FOREIGN\n"
+	"	WHERE FOR_NAME = old_table_name\n"
+	"	      AND TO_BINARY(FOR_NAME) = TO_BINARY(old_table_name);\n"
+	"	IF (SQL % NOTFOUND) THEN\n"
+	"	 found := 0;\n"
+	"	ELSE\n"
+	"	 UPDATE SYS_FOREIGN\n"
+	"	 SET FOR_NAME = new_table_name\n"
+	"	 WHERE ID = foreign_id;\n"
+	"	 id_len := LENGTH(foreign_id);\n"
+	"	 IF (INSTR(foreign_id, '/') > 0) THEN\n"
+	"	 	IF (INSTR(foreign_id,\n"
+	"				gen_constr_prefix) > 0)\n"
+	"		THEN\n"
+	"		  new_foreign_id :=\n"
+	"		    CONCAT(new_table_name,\n"
+	"			SUBSTR(foreign_id, old_t_name_len,\n"
+	"			      	 id_len - old_t_name_len));\n"
+	"		ELSE\n"
+	"		  new_foreign_id :=\n"
+	"		    CONCAT(new_db_name,\n"
+	"			SUBSTR(foreign_id,\n"
+	"				old_db_name_len,\n"
+	"				 id_len - old_db_name_len));\n"
+	"		END IF;\n"
+	"		UPDATE SYS_FOREIGN\n"
+	"		SET ID = new_foreign_id\n"
+	"		WHERE ID = foreign_id;\n"
+	"		UPDATE SYS_FOREIGN_COLS\n"
+	"		SET ID = new_foreign_id\n"
+	"		WHERE ID = foreign_id;\n"
+	"	 END IF;\n"
+	"	END IF;\n"
+	"END LOOP;\n"
+	"UPDATE SYS_FOREIGN SET REF_NAME = new_table_name\n"
+	"WHERE REF_NAME = old_table_name\n"
+	"      AND TO_BINARY(REF_NAME) = TO_BINARY(old_table_name);\n";
+	static const char str5[] =
+	"END;\n";
+
+	mem_heap_t*	heap			= NULL;
+	const char**	constraints_to_drop	= NULL;
+	ulint		n_constraints_to_drop	= 0;
+        ibool           recovering_temp_table   = FALSE;
+	ulint		len;
+	ulint		i;
+        ibool		success;
+	/* database name length + 1; 0 if not renaming to a temporary table */
+	ulint		db_name_len;
+	char*		sql;
+	char*		sqlend;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_a(old_name != NULL);
+	ut_a(new_name != NULL);
+
+	if (srv_created_new_raw || srv_force_recovery) {
+		fputs(
+		"InnoDB: A new raw disk partition was initialized or\n"
+		"InnoDB: innodb_force_recovery is on: we do not allow\n"
+		"InnoDB: database modifications by the user. Shut down\n"
+		"InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+		"InnoDB: with raw, and innodb_force_... is removed.\n",
+		stderr);
+
+  		trx_commit_for_mysql(trx);
+		return(DB_ERROR);
+	}
+	
+	if (row_mysql_is_system_table(new_name)) {
+	    	
+		fprintf(stderr,
+    "InnoDB: Error: trying to create a MySQL system table %s of type InnoDB.\n"
+    "InnoDB: MySQL system tables must be of the MyISAM type!\n",
+		new_name);
+
+  		trx_commit_for_mysql(trx);
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "renaming table";
+	trx_start_if_not_started(trx);
+
+	if (row_mysql_is_recovered_tmp_table(new_name)) {
+		/* In this case we neither lock nor unlock the dictionary */
+                recovering_temp_table = TRUE;
+        } else {
+		/* Serialize data dictionary operations with dictionary mutex:
+		no deadlocks can occur then in these operations */
+
+		row_mysql_lock_data_dictionary(trx);
+	}
+
+	table = dict_table_get_low(old_name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+	    	ut_print_timestamp(stderr);
+
+                fputs("  InnoDB: Error: table ", stderr);
+                ut_print_name(stderr, trx, old_name);
+                fputs(" does not exist in the InnoDB internal\n"
+     	"InnoDB: data dictionary though MySQL is trying to rename the table.\n"
+     	"InnoDB: Have you copied the .frm file of the table to the\n"
+	"InnoDB: MySQL database directory from another database?\n"
+	"InnoDB: You can look for further help from\n"
+        "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+	"InnoDB_troubleshooting_datadict.html\n", stderr);
+		goto funct_exit;
+	}
+
+	if (table->ibd_file_missing) {
+		err = DB_TABLE_NOT_FOUND;
+	    	ut_print_timestamp(stderr);
+
+                fputs("  InnoDB: Error: table ", stderr);
+                ut_print_name(stderr, trx, old_name);
+                fputs(
+	" does not have an .ibd file in the database directory.\n"
+	"InnoDB: You can look for further help from\n"
+        "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+	"InnoDB_troubleshooting_datadict.html\n", stderr);
+		goto funct_exit;
+	}
+
+	/* calculate the length of the SQL string, excluding the final NUL */
+	len = (sizeof str1) + (sizeof str2) + (sizeof str3) + (sizeof str5) - 4
+		+ ut_strlenq(new_name, '\'') + ut_strlenq(old_name, '\'');
+
+	if (row_is_mysql_tmp_table_name(new_name)) {
+		db_name_len = dict_get_db_name_len(old_name) + 1;
+
+		/* MySQL is doing an ALTER TABLE command and it renames the
+		original table to a temporary table name. We want to preserve
+		the original foreign key constraint definitions despite the
+		name change. An exception is those constraints for which
+		the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+		heap = mem_heap_create(100);
+		
+		err = dict_foreign_parse_drop_constraints(heap, trx,
+					table,
+					&n_constraints_to_drop,
+					&constraints_to_drop);
+		if (err != DB_SUCCESS) {
+
+			goto funct_exit;
+		}
+		
+		/* reserve space for the database name prefix of each id */
+		len += 2 * n_constraints_to_drop
+			* (ut_strlenq(old_name, '\'')
+			- ut_strlenq(old_name + db_name_len, '\''));
+
+		for (i = 0; i < n_constraints_to_drop; i++) {
+			ulint	addlen
+				= 2 * ut_strlenq(constraints_to_drop[i], '\'')
+				+ ((sizeof str4a1) + (sizeof str4a2)
+				+ (sizeof str4a3) - 3);
+			if (!strchr(constraints_to_drop[i], '/')) {
+				addlen *= 2;
+			}
+			len += addlen;
+		}
+	} else {
+		db_name_len = 0;
+		len += (sizeof str4b) - 1;
+	}
+
+	sql = sqlend = mem_alloc(len + 1);
+	memcpy(sql, str1, (sizeof str1) - 1);
+	sqlend += (sizeof str1) - 1;
+	sqlend = ut_strcpyq(sqlend, '\'', new_name);
+	memcpy(sqlend, str2, (sizeof str2) - 1);
+	sqlend += (sizeof str2) - 1;
+	sqlend = ut_strcpyq(sqlend, '\'', old_name);
+	memcpy(sqlend, str3, (sizeof str3) - 1);
+	sqlend += (sizeof str3) - 1;
+
+	if (db_name_len) {
+		/* Internally, old format < 4.0.18 constraints have as the
+		constraint id <number>_<number>, while new format constraints
+		have <databasename>/<constraintname>. */
+
+		for (i = 0; i < n_constraints_to_drop; i++) {
+			memcpy(sqlend, str4a1, (sizeof str4a1) - 1);
+			sqlend += (sizeof str4a1) - 1;
+			sqlend = ut_memcpyq(sqlend, '\'',
+				old_name, db_name_len);
+			sqlend = ut_strcpyq(sqlend, '\'',
+				constraints_to_drop[i]);
+			memcpy(sqlend, str4a2, (sizeof str4a2) - 1);
+			sqlend += (sizeof str4a2) - 1;
+			sqlend = ut_memcpyq(sqlend, '\'',
+				old_name, db_name_len);
+                        sqlend = ut_strcpyq(sqlend, '\'',
+				constraints_to_drop[i]);
+			memcpy(sqlend, str4a3, (sizeof str4a3) - 1);
+			sqlend += (sizeof str4a3) - 1;
+
+			if (!strchr(constraints_to_drop[i], '/')) {
+				/* If this happens to be an old format
+				constraint, let us delete it. Since all new
+				format constraints contain '/', it does no
+				harm to run these DELETEs anyway. */
+
+				memcpy(sqlend, str4a1, (sizeof str4a1) - 1);
+				sqlend += (sizeof str4a1) - 1;
+				sqlend = ut_strcpyq(sqlend, '\'',
+					constraints_to_drop[i]);
+				memcpy(sqlend, str4a2, (sizeof str4a2) - 1);
+				sqlend += (sizeof str4a2) - 1;
+                        	sqlend = ut_strcpyq(sqlend, '\'',
+					constraints_to_drop[i]);
+				memcpy(sqlend, str4a3, (sizeof str4a3) - 1);
+				sqlend += (sizeof str4a3) - 1;
+			}
+		}
+	}
+	else {
+		memcpy(sqlend, str4b, (sizeof str4b) - 1);
+		sqlend += (sizeof str4b) - 1;
+	}
+	/* str5 is copied with its terminating NUL, unlike the other pieces */
+	memcpy(sqlend, str5, sizeof str5);
+	sqlend += sizeof str5;
+
+	ut_a(sqlend == sql + len + 1);
+	
+	graph = pars_sql(sql);
+
+	ut_a(graph);
+	mem_free(sql);
+
+	graph->trx = trx;
+	trx->graph = NULL;
+
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	ut_a(thr = que_fork_start_command(graph));
+
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		if (err == DB_DUPLICATE_KEY) {
+	    		ut_print_timestamp(stderr);
+			fputs(
+     "  InnoDB: Error; possible reasons:\n"
+     "InnoDB: 1) Table rename would cause two FOREIGN KEY constraints\n"
+     "InnoDB: to have the same internal name in case-insensitive comparison.\n"
+     "InnoDB: 2) table ", stderr);
+                ut_print_name(stderr, trx, new_name);
+                fputs(" exists in the InnoDB internal data\n"
+     "InnoDB: dictionary though MySQL is trying rename table ", stderr);
+                ut_print_name(stderr, trx, old_name);
+		fputs(" to it.\n"
+     "InnoDB: Have you deleted the .frm file and not used DROP TABLE?\n"
+     "InnoDB: You can look for further help from\n"
+     "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+     "InnoDB_troubleshooting_datadict.html\n"
+     "InnoDB: If table ", stderr);
+			ut_print_name(stderr, trx, new_name);
+			fputs(
+		" is a temporary table #sql..., then it can be that\n"
+     "InnoDB: there are still queries running on the table, and it will be\n"
+     "InnoDB: dropped automatically when the queries end.\n"
+     "InnoDB: You can drop the orphaned table inside InnoDB by\n"
+     "InnoDB: creating an InnoDB table with the same name in another\n"
+     "InnoDB: database and copying the .frm file to the current database.\n"
+     "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n"
+     "InnoDB: succeed.\n", stderr);
+		}
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+		trx->error_state = DB_SUCCESS;
+	} else {
+		/* The following call will also rename the .ibd data file if
+		the table is stored in a single-table tablespace */
+
+		success = dict_table_rename_in_cache(table, new_name,
+				!row_is_mysql_tmp_table_name(new_name));
+		if (!success) {
+			trx->error_state = DB_SUCCESS;
+			trx_general_rollback_for_mysql(trx, FALSE, NULL);
+			trx->error_state = DB_SUCCESS;
+			ut_print_timestamp(stderr);
+			fputs(" InnoDB: Error in table rename, cannot rename ",
+				stderr);
+			ut_print_name(stderr, trx, old_name);
+			fputs(" to ", stderr);
+			ut_print_name(stderr, trx, new_name);
+			putc('\n', stderr);
+			err = DB_ERROR;
+
+			goto funct_exit;
+		}
+
+		err = dict_load_foreigns(new_name, trx->check_foreigns);
+
+		if (row_is_mysql_tmp_table_name(old_name)) {
+
+			/* MySQL is doing an ALTER TABLE command and it
+			renames the created temporary table to the name
+			of the original table. In the ALTER TABLE we maybe
+			created some FOREIGN KEY constraints for the temporary
+			table. But we want to load also the foreign key
+			constraint definitions for the original table name. */
+
+			if (err != DB_SUCCESS) {
+	    			ut_print_timestamp(stderr);
+				fputs("  InnoDB: Error: in ALTER TABLE ",
+					stderr);
+				ut_print_name(stderr, trx, new_name);
+				fputs("\n"
+	"InnoDB: has or is referenced in foreign key constraints\n"
+	"InnoDB: which are not compatible with the new table definition.\n",
+					stderr);
+
+				ut_a(dict_table_rename_in_cache(table,
+					old_name, FALSE));
+				trx->error_state = DB_SUCCESS;
+				trx_general_rollback_for_mysql(trx, FALSE,
+									NULL);
+				trx->error_state = DB_SUCCESS;
+			}
+		} else {
+			if (err != DB_SUCCESS) {
+
+	    			ut_print_timestamp(stderr);
+
+				fputs(
+				"  InnoDB: Error: in RENAME TABLE table ",
+					stderr);
+				ut_print_name(stderr, trx, new_name);
+				fputs("\n"
+     "InnoDB: is referenced in foreign key constraints\n"
+     "InnoDB: which are not compatible with the new table definition.\n",
+					stderr);
+     
+				ut_a(dict_table_rename_in_cache(table,
+					old_name, FALSE));
+						
+				trx->error_state = DB_SUCCESS;
+				trx_general_rollback_for_mysql(trx, FALSE,
+									NULL);
+				trx->error_state = DB_SUCCESS;
+			}
+		}
+	}
+funct_exit:	
+	if (!recovering_temp_table) {
+		row_mysql_unlock_data_dictionary(trx);
+	}
+
+	if (graph) {
+		que_graph_free(graph);
+	}
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	
+  	trx_commit_for_mysql(trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*************************************************************************
+Scans an index in the read view of the current transaction and verifies
+that its entries are in ascending order and that no unique constraint is
+broken; also counts the index entries seen. */
+static
+ibool
+row_scan_and_check_index(
+/*=====================*/
+					/* out: TRUE if ok */
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct in MySQL */
+	dict_index_t*	index,		/* in: index */
+	ulint*		n_rows)		/* out: number of entries seen in the
+					current consistent read */
+{
+	dtuple_t*	last_entry	= NULL;
+	const char*	complaint;
+	ulint		n_matched_fields;
+	ulint		n_matched_bytes;
+	byte*		row_buf;
+	ulint		search_ret;
+	rec_t*		rec;
+	ibool		all_ok		= TRUE;
+	int		order;
+	ibool		has_null;
+	ulint		field_no;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	offsets_[0] = sizeof(offsets_) / sizeof(offsets_[0]);
+
+	*n_rows = 0;
+
+	row_buf = mem_alloc(UNIV_PAGE_SIZE);
+	heap = mem_heap_create(100);
+
+	/* Build a dummy template in prebuilt; the scan over the index
+	entries below is driven through it */
+
+	prebuilt->index = index;
+	prebuilt->sql_stat_start = TRUE;
+	prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+	prebuilt->n_template = 0;
+	prebuilt->need_to_access_clustered = FALSE;
+	prebuilt->select_lock_type = LOCK_NONE;
+
+	dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+	search_ret = row_search_for_mysql(row_buf, PAGE_CUR_G, prebuilt, 0, 0);
+
+	while (search_ret == DB_SUCCESS) {
+		++*n_rows;
+		/* Because of the dummy template, row_search... returns the
+		index record in row_buf, and stores the offset of the record
+		origin within row_buf in its first 4 bytes */
+
+		rec = row_buf + mach_read_from_4(row_buf);
+
+		if (last_entry != NULL) {
+			n_matched_fields = 0;
+			n_matched_bytes = 0;
+			offsets = rec_get_offsets(rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+			order = cmp_dtuple_rec_with_match(last_entry, rec,
+						offsets, &n_matched_fields,
+						&n_matched_bytes);
+
+			/* Equal key values are tolerated in a unique
+			secondary index when they contain SQL NULLs */
+
+			has_null = FALSE;
+			for (field_no = 0;
+			     field_no
+			     < dict_index_get_n_ordering_defined_by_user(index);
+			     field_no++) {
+				if (dfield_get_len(dtuple_get_nth_field(
+						last_entry, field_no))
+				    == UNIV_SQL_NULL) {
+					has_null = TRUE;
+				}
+			}
+
+			complaint = NULL;
+			if (order > 0) {
+				complaint =
+				"InnoDB: index records in a wrong order in ";
+			} else if ((index->type & DICT_UNIQUE)
+				   && !has_null
+				   && n_matched_fields >=
+			   dict_index_get_n_ordering_defined_by_user(index)) {
+				complaint = "InnoDB: duplicate key in ";
+			}
+
+			if (complaint != NULL) {
+				fputs(complaint, stderr);
+				dict_index_name_print(stderr,
+						prebuilt->trx, index);
+				fputs("\n"
+					"InnoDB: prev record ", stderr);
+				dtuple_print(stderr, last_entry);
+				fputs("\n"
+					"InnoDB: record ", stderr);
+				rec_print_new(stderr, rec, offsets);
+				putc('\n', stderr);
+				all_ok = FALSE;
+			}
+		}
+
+		/* Build the entry that the next record is compared against */
+
+		mem_heap_empty(heap);
+		offsets = offsets_;
+		last_entry = row_rec_to_index_entry(ROW_COPY_DATA, index,
+								rec, heap);
+		search_ret = row_search_for_mysql(row_buf, PAGE_CUR_G,
+						prebuilt, 0, ROW_SEL_NEXT);
+	}
+
+	mem_free(row_buf);
+	mem_heap_free(heap);
+
+	return(all_ok);
+}
+
+/*************************************************************************
+Checks a table for corruption: validates the B-tree of every index, scans
+every index, and compares the entry counts of the indexes. */
+
+ulint
+row_check_table_for_mysql(
+/*======================*/
+					/* out: DB_ERROR or DB_SUCCESS */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	dict_table_t*	table		= prebuilt->table;
+	dict_index_t*	index;
+	ulint		n_entries;
+	ulint		n_entries_in_first	= ULINT_UNDEFINED;
+	ulint		result		= DB_SUCCESS;
+	ulint		saved_isolation_level;
+	const ulint	extra_wait	= 7200;	/* 2 hours */
+
+	if (table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+				table->name);
+		return(DB_ERROR);
+	}
+
+	prebuilt->trx->op_info = "checking table";
+
+	/* The index record counts must be run at an isolation level
+	>= READ COMMITTED, because a dirty read can see a wrong number
+	of records in some index; to play safe, REPEATABLE READ is
+	always used here and the original level is restored afterwards */
+
+	saved_isolation_level = prebuilt->trx->isolation_level;
+	prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+	/* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+
+	mutex_enter(&kernel_mutex);
+	srv_fatal_semaphore_wait_threshold += extra_wait;
+	mutex_exit(&kernel_mutex);
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (!btr_validate_tree(index->tree)) {
+			/* Validation failed: skip the scan of this index */
+			result = DB_ERROR;
+
+			continue;
+		}
+
+		if (!row_scan_and_check_index(prebuilt, index, &n_entries)) {
+			result = DB_ERROR;
+		}
+
+		if (index == dict_table_get_first_index(table)) {
+			/* The first index provides the reference count */
+			n_entries_in_first = n_entries;
+
+			continue;
+		}
+
+		if (n_entries != n_entries_in_first) {
+			result = DB_ERROR;
+
+			fputs("Error: ", stderr);
+			dict_index_name_print(stderr, prebuilt->trx, index);
+			fprintf(stderr,
+				" contains %lu entries, should be %lu\n",
+				(ulong) n_entries,
+				(ulong) n_entries_in_first);
+		}
+	}
+
+	/* Restore the original isolation level */
+
+	prebuilt->trx->isolation_level = saved_isolation_level;
+
+	/* The whole adaptive hash index, for all tables, is validated
+	as well at every CHECK TABLE */
+
+	if (!btr_search_validate()) {
+		result = DB_ERROR;
+	}
+
+	/* Restore the fatal lock wait timeout after CHECK TABLE. */
+
+	mutex_enter(&kernel_mutex);
+	srv_fatal_semaphore_wait_threshold -= extra_wait;
+	mutex_exit(&kernel_mutex);
+
+	prebuilt->trx->op_info = "";
+
+	return(result);
+}
diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.c
new file mode 100644
index 00000000000..5893e016011
--- /dev/null
+++ b/storage/innobase/row/row0purge.c
@@ -0,0 +1,671 @@
+/******************************************************
+Purge obsolete records
+
+(c) 1997 Innobase Oy
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "log0log.h"
+
+/************************************************************************
+Allocates a purge node from heap and attaches it to a query graph. */
+
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+				/* out, own: purge node */
+	que_thr_t*	parent,	/* in: parent node, i.e., a thr node */
+	mem_heap_t*	heap)	/* in: memory heap where created */
+{
+	purge_node_t*	purge_node;
+
+	ut_ad(parent && heap);
+
+	purge_node = mem_heap_alloc(heap, sizeof *purge_node);
+
+	/* A separate heap is created and stored in the node */
+	purge_node->heap = mem_heap_create(256);
+
+	purge_node->common.parent = parent;
+	purge_node->common.type = QUE_NODE_PURGE;
+
+	return(purge_node);
+}
+
+/***************************************************************
+Positions the persistent cursor of the purge node on the clustered index
+record: restores a stored position, or else searches with node->ref. */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+				/* out: TRUE if the record was found */
+	ulint		mode,	/* in: latching mode */
+	purge_node_t*	node,	/* in: row purge node */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	btr_pcur_t*	cursor	= &(node->pcur);
+
+	if (node->found_clust) {
+		/* An earlier call found the record and stored the cursor
+		position: restore it */
+
+		return(btr_pcur_restore_position(mode, cursor, mtr));
+	}
+
+	node->found_clust = row_search_on_row_ref(cursor, mode, node->table,
+							node->ref, mtr);
+	if (node->found_clust) {
+		/* Store the position for the following calls */
+		btr_pcur_store_position(cursor, mtr);
+	}
+
+	return(node->found_clust);
+}
+
+/***************************************************************
+Removes a delete marked clustered index record if possible, in its own mtr. */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+				/* out: TRUE if removed, or if not found, or
+				if modified after the delete marking */
+	purge_node_t*	node,	/* in: row purge node */
+	ulint		mode)	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+	dict_index_t*	index;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ulint		err;
+	mtr_t		mtr;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL; /* set only by rec_get_offsets() */
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE]; /* offsets buffer */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_; /* n elements */
+
+	index = dict_table_get_first_index(node->table);
+	
+	pcur = &(node->pcur);
+	btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	mtr_start(&mtr);
+	/* Restore or search the cursor position on the clustered record */
+	success = row_purge_reposition_pcur(mode, node, &mtr);
+
+	if (!success) {
+		/* The record was not found: it is already removed */
+
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+		return(TRUE);
+	}
+
+	rec = btr_pcur_get_rec(pcur);
+	/* Compare the roll ptr stored in rec to node->roll_ptr */
+	if (0 != ut_dulint_cmp(node->roll_ptr,
+		row_get_rec_roll_ptr(rec, index, rec_get_offsets(
+			rec, index, offsets_, ULINT_UNDEFINED, &heap)))) {
+		if (heap) {
+			mem_heap_free(heap);
+		}
+		/* Someone else has modified the record later: do not remove */
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+		return(TRUE);
+	}
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	/* Delete: optimistic in leaf mode, pessimistic in tree mode */
+	if (mode == BTR_MODIFY_LEAF) {
+		success = btr_cur_optimistic_delete(btr_cur, &mtr);
+	} else {
+		ut_ad(mode == BTR_MODIFY_TREE);
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, FALSE, &mtr);
+
+		if (err == DB_SUCCESS) {
+			success = TRUE;
+		} else if (err == DB_OUT_OF_FILE_SPACE) {
+			success = FALSE; /* returned as FALSE to the caller */
+		} else {
+			ut_error; /* no other error code is handled */
+		}
+	}
+
+	btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+	return(success);
+}
+		
+/***************************************************************
+Removes a clustered index record if it has not been modified after the delete
+marking. Tries BTR_MODIFY_LEAF mode first, then retries in BTR_MODIFY_TREE. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+	purge_node_t*	node)	/* in: row purge node */
+{
+	ibool	success;
+	ulint	n_sleeps;
+
+	/* First attempt: BTR_MODIFY_LEAF mode */
+	if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+
+		return;
+	}
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	for (n_sleeps = 0;; n_sleeps++) {
+		success = row_purge_remove_clust_if_poss_low(node,
+							BTR_MODIFY_TREE);
+		if (success || n_sleeps >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+			break;
+		}
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+
+	ut_a(success);
+}
+ 						
+/***************************************************************
+Removes a secondary index entry if possible. */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+				/* out: TRUE if removed, not found, or kept */
+	purge_node_t*	node,	/* in: row purge node */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry */
+	ulint		mode)	/* in: latch mode BTR_MODIFY_LEAF or
+				BTR_MODIFY_TREE */	
+{
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ibool		old_has = 0; /* initialized to remove a warning */
+	ibool		found;
+	ulint		err;
+	mtr_t		mtr;
+	mtr_t*		mtr_vers;
+	
+	log_free_check();
+	mtr_start(&mtr);
+	
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	if (!found) {
+		/* Not found: there is nothing to remove */
+
+		/* fputs("PURGE:........sec entry not found\n", stderr); */
+		/* dtuple_print(entry); */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(TRUE);
+	}
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+	
+	/* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If some requires,
+	we should do nothing. */
+	/* mtr_vers is used only for the clustered index record lookup */
+	mtr_vers = mem_alloc(sizeof(mtr_t));
+	
+	mtr_start(mtr_vers);
+
+	success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, mtr_vers);
+
+	if (success) {		
+		old_has = row_vers_old_has_index_entry(TRUE,
+					btr_pcur_get_rec(&(node->pcur)),
+					mtr_vers, index, entry);
+	}
+	/* The version check is over: mtr_vers is not needed after this */
+	btr_pcur_commit_specify_mtr(&(node->pcur), mtr_vers);
+
+	mem_free(mtr_vers);
+	
+	if (!success || !old_has) {
+		/* Remove the index record */
+
+		if (mode == BTR_MODIFY_LEAF) {		
+			success = btr_cur_optimistic_delete(btr_cur, &mtr);
+		} else {
+			ut_ad(mode == BTR_MODIFY_TREE);
+			btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+							FALSE, &mtr);
+			if (err == DB_SUCCESS) {
+				success = TRUE;
+			} else if (err == DB_OUT_OF_FILE_SPACE) {
+				success = FALSE;
+			} else {
+				ut_error; /* no other error code is handled */
+			}
+		}
+	}
+	/* If the entry was kept, success is still TRUE from the reposition */
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible, retrying if the removal fails. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+	purge_node_t*	node,	/* in: row purge node */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry)	/* in: index entry */
+{
+	ibool	success;
+	ulint	n_sleeps	= 0;
+
+	/* First attempt: BTR_MODIFY_LEAF mode */
+	if (row_purge_remove_sec_if_poss_low(node, index, entry,
+							BTR_MODIFY_LEAF)) {
+
+		return;
+	}
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	for (;;) {
+		success = row_purge_remove_sec_if_poss_low(node, index, entry,
+							BTR_MODIFY_TREE);
+
+		if (success || n_sleeps >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+			break;
+		}
+
+		n_sleeps++;
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+
+	ut_a(success);
+}
+
+/***************************************************************
+Purges a delete marking: first secondary entries, then the clustered rec. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+	purge_node_t*	node)	/* in: row purge node */
+{
+	mem_heap_t*	entry_heap;
+	dtuple_t*	sec_entry;
+
+	ut_ad(node);
+
+	entry_heap = mem_heap_create(1024);
+
+	/* node->index is advanced over the indexes still to be visited */
+
+	for (; node->index != NULL;
+		node->index = dict_table_get_next_index(node->index)) {
+
+		/* Build the index entry from the row stored in the node */
+		sec_entry = row_build_index_entry(node->row, node->index,
+								entry_heap);
+
+		row_purge_remove_sec_if_poss(node, node->index, sec_entry);
+	}
+
+	mem_heap_free(entry_heap);
+
+	row_purge_remove_clust_if_poss(node);
+}
+	
+/***************************************************************
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern(
+/*==========================*/
+	purge_node_t*	node)	/* in: row purge node */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	dict_index_t*	index;
+	upd_field_t*	ufield;
+	ibool		is_insert;
+	ulint		rseg_id;
+	ulint		page_no;
+	ulint		offset;
+	ulint		internal_offset;
+	byte*		data_field;
+	ulint		data_field_len;
+	ulint		i;
+	mtr_t		mtr;
+	
+	ut_ad(node);
+	/* An update of a delete marked record: only free the extern fields */
+	if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+		goto skip_secondaries;
+	}
+
+	heap = mem_heap_create(1024); /* for the index entries built below */
+	/* Go through the indexes starting from node->index */
+	while (node->index != NULL) {
+		index = node->index;
+
+		if (row_upd_changes_ord_field_binary(NULL, node->index,
+							node->update)) {
+			/* Build the older version of the index entry */
+			entry = row_build_index_entry(node->row, index, heap);
+
+			row_purge_remove_sec_if_poss(node, index, entry);
+		}
+
+		node->index = dict_table_get_next_index(node->index);
+	}
+
+	mem_heap_free(heap);	
+
+skip_secondaries:
+	/* Free possible externally stored fields */
+	for (i = 0; i < upd_get_n_fields(node->update); i++) {
+
+		ufield = upd_get_nth_field(node->update, i);
+
+		if (ufield->extern_storage) {
+			/* We use the fact that new_val points to
+			node->undo_rec and get thus the offset of
+			dfield data inside the undo record. Then we
+			can calculate from node->roll_ptr the file
+			address of the new_val data */
+
+			internal_offset = ((byte*)ufield->new_val.data)
+						- node->undo_rec;
+						
+			ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+			trx_undo_decode_roll_ptr(node->roll_ptr,
+						&is_insert, &rseg_id,
+						&page_no, &offset);
+			mtr_start(&mtr); /* one mtr per freed field */
+
+			/* We have to acquire an X-latch to the clustered
+			index tree */
+
+			index = dict_table_get_first_index(node->table);
+
+			mtr_x_lock(dict_tree_get_lock(index->tree), &mtr);
+
+			/* NOTE: we must also acquire an X-latch to the
+			root page of the tree. We will need it when we
+			free pages from the tree. If the tree is of height 1,
+			the tree X-latch does NOT protect the root page,
+			because it is also a leaf page. Since we will have a
+			latch on an undo log page, we would break the
+			latching order if we would only later latch the
+			root page of such a tree! */
+			
+			btr_root_get(index->tree, &mtr);
+
+			/* We assume in purge of externally stored fields
+			that the space id of the undo log record is 0! */
+
+			data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr)
+				     + offset + internal_offset;
+
+#ifdef UNIV_SYNC_DEBUG
+			buf_page_dbg_add_level(buf_frame_align(data_field),
+						SYNC_TRX_UNDO_PAGE);
+#endif /* UNIV_SYNC_DEBUG */
+				     
+			data_field_len = ufield->new_val.len;
+
+			btr_free_externally_stored_field(index, data_field,
+						data_field_len, FALSE, &mtr);
+			mtr_commit(&mtr);
+		}
+	}
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+				/* out: TRUE if purge operation required:
+				NOTE that then the CALLER must unfreeze
+				data dictionary! */
+	purge_node_t*	node,	/* in: row purge node */
+	ibool*		updated_extern,
+				/* out: TRUE if an externally stored field
+				was updated */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_index_t*	clust_index;
+	byte*		ptr;
+	trx_t*		trx;
+	dulint		undo_no;
+	dulint		table_id;
+	dulint		trx_id;
+	dulint		roll_ptr;
+	ulint		info_bits;
+	ulint		type;
+	ulint		cmpl_info;
+
+	ut_ad(node && thr);
+
+	trx = thr_get_trx(thr);
+
+	ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+					updated_extern, &undo_no, &table_id);
+	node->rec_type = type;
+
+	if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+
+		return(FALSE);
+	}
+
+	ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+								&info_bits);
+	node->table = NULL;
+
+	if (type == TRX_UNDO_UPD_EXIST_REC
+	    && (cmpl_info & UPD_NODE_NO_ORD_CHANGE) && !(*updated_extern)) {
+
+		/* Purge requires no changes to indexes: we may return */
+
+		return(FALSE);
+	}
+
+	/* Prevent DROP TABLE etc. from running when we are doing the purge
+	for this row */
+
+	row_mysql_freeze_data_dictionary(trx);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	node->table = dict_table_get_on_id_low(table_id, trx);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	if (node->table == NULL) {
+		/* The table has been dropped: no need to do purge */
+
+		row_mysql_unfreeze_data_dictionary(trx);
+
+		return(FALSE);
+	}
+
+	if (node->table->ibd_file_missing) {
+		/* We skip purge of missing .ibd files */
+
+		node->table = NULL;
+
+		row_mysql_unfreeze_data_dictionary(trx);
+
+		return(FALSE);
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	if (clust_index == NULL) {
+		/* The table was corrupt in the data dictionary */
+
+		row_mysql_unfreeze_data_dictionary(trx);
+
+		return(FALSE);
+	}
+
+	ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+								node->heap);
+
+	ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+					roll_ptr, info_bits, trx,
+					node->heap, &(node->update));
+
+	/* Read to the partial row the fields that occur in indexes. NOTE:
+	the flag test needs the parentheses, as ! binds tighter than & */
+	if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+		ptr = trx_undo_rec_get_partial_row(ptr, clust_index,
+						&(node->row), node->heap);
+	}
+
+	return(TRUE);
+}
+
+/***************************************************************
+Fetches an undo log record and does the purge for the recorded operation.
+If none left, or the current purge completed, returns the control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_purge(
+/*======*/
+				/* out: DB_SUCCESS; this implementation
+				returns no other value */
+	purge_node_t*	node,	/* in: row purge node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dulint	roll_ptr;
+	ibool	purge_needed;
+	ibool	updated_extern; /* set only by row_purge_parse_undo_rec() */
+	trx_t*	trx;
+	
+	ut_ad(node && thr);
+
+	trx = thr_get_trx(thr);
+	
+	node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+						&(node->reservation),
+						node->heap);
+	if (!node->undo_rec) {
+		/* Purge completed for this query thread */
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(DB_SUCCESS);
+	}
+
+	node->roll_ptr = roll_ptr;
+
+	if (node->undo_rec == &trx_purge_dummy_rec) {
+		purge_needed = FALSE; /* the dummy record is not parsed */
+	} else {
+		purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
+									thr);
+		/* If purge_needed == TRUE, we must also remember to unfreeze
+		data dictionary! */
+	}
+
+	if (purge_needed) {
+		node->found_clust = FALSE;
+		/* Start from the index which follows the first index */
+		node->index = dict_table_get_next_index(
+				dict_table_get_first_index(node->table));
+
+		if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+			row_purge_del_mark(node);
+
+		} else if (updated_extern
+			    || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+			row_purge_upd_exist_or_extern(node);
+		}
+		/* found_clust is set when node->pcur was positioned */
+		if (node->found_clust) {
+			btr_pcur_close(&(node->pcur));
+		}
+
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	/* Do some cleanup */
+	trx_purge_rec_release(node->reservation);
+	mem_heap_empty(node->heap);
+	/* This node stays the run node of the query thread */
+	thr->run_node = node;
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph. */
+
+que_thr_t*
+row_purge_step(
+/*===========*/
+				/* out: query thread to run next: always thr */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	purge_node_t*	node;
+	ulint		err;	/* read only by the ut_ad() check below */
+
+	ut_ad(thr);
+	
+	node = thr->run_node;
+	/* The run node of the query thread must be a purge node */
+	ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+	/* row_purge() assigns thr->run_node itself */
+	err = row_purge(node, thr);
+
+	ut_ad(err == DB_SUCCESS);
+
+	return(thr);
+} 
diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c
new file mode 100644
index 00000000000..a6d3f1d5ab0
--- /dev/null
+++ b/storage/innobase/row/row0row.c
@@ -0,0 +1,730 @@
+/******************************************************
+General row routines
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+
+/*************************************************************************
+Reads the trx id or roll ptr field from a clustered index record. There are
+specialized inline functions which are faster than this one. */
+
+dulint
+row_get_rec_sys_field(
+/*==================*/
+				/* out: value of the field */
+	ulint		type,	/* in: DATA_TRX_ID or DATA_ROLL_PTR */
+	rec_t*		rec,	/* in: record */
+	dict_index_t*	index,	/* in: clustered index */
+	const ulint*	offsets)/* in: rec_get_offsets(rec, index) */
+{
+	ulint		sys_pos;
+	byte*		sys_field;
+	ulint		sys_len;
+
+	ut_ad(index->type & DICT_CLUSTERED);
+
+	/* Find the position of the system column in the index, and then
+	the field itself in the record */
+	sys_pos = dict_index_get_sys_col_pos(index, type);
+	sys_field = rec_get_nth_field(rec, offsets, sys_pos, &sys_len);
+
+	if (type != DATA_TRX_ID) {
+		ut_ad(type == DATA_ROLL_PTR);
+
+		return(trx_read_roll_ptr(sys_field));
+	}
+
+	return(trx_read_trx_id(sys_field));
+}
+
+/*************************************************************************
+Writes the trx id or roll ptr field to a clustered index record. There are
+specialized inline functions which are faster than this one. */
+
+void
+row_set_rec_sys_field(
+/*==================*/
+				/* (no return value) */
+	ulint		type,	/* in: DATA_TRX_ID or DATA_ROLL_PTR */
+	rec_t*		rec,	/* in: record */
+	dict_index_t*	index,	/* in: clustered index */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	dulint		val)	/* in: value to set */
+{
+	ulint	sys_pos;
+	byte*	sys_field;
+	ulint	sys_len;
+
+	ut_ad(index->type & DICT_CLUSTERED);
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	sys_pos = dict_index_get_sys_col_pos(index, type);
+	sys_field = rec_get_nth_field(rec, offsets, sys_pos, &sys_len);
+
+	if (type != DATA_TRX_ID) {
+		ut_ad(type == DATA_ROLL_PTR);
+
+		trx_write_roll_ptr(sys_field, val);
+
+		return;
+	}
+
+	trx_write_trx_id(sys_field, val);
+}
+
+/*********************************************************************
+Builds, from a row of a table, the entry which has to be inserted to an
+index on the table when the row is inserted. */
+
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+				/* out: index entry which should be inserted */
+	dtuple_t*	row, 	/* in: row which should be inserted to the
+				table */
+	dict_index_t*	index, 	/* in: index on the table */
+	mem_heap_t*	heap)	/* in: memory heap from which the memory for
+				the index entry is allocated */
+{
+	dtuple_t*	entry;
+	ulint		n_fields;
+	ulint		n_cmp;
+	dict_field_t*	ifield;
+	dfield_t*	dst;
+	dfield_t*	src;
+	ulint		stored_len;
+	ulint		i;
+
+	ut_ad(row && index && heap);
+	ut_ad(dtuple_check_typed(row));
+
+	n_fields = dict_index_get_n_fields(index);
+	entry = dtuple_create(heap, n_fields);
+
+	/* All the fields are compared in a DICT_UNIVERSAL index; otherwise
+	the number given by dict_index_get_n_unique_in_tree() */
+
+	n_cmp = (index->type & DICT_UNIVERSAL)
+		? n_fields
+		: dict_index_get_n_unique_in_tree(index);
+
+	dtuple_set_n_fields_cmp(entry, n_cmp);
+
+	for (i = 0; i < n_fields; i++) {
+		ifield = dict_index_get_nth_field(index, i);
+
+		dst = dtuple_get_nth_field(entry, i);
+		src = dtuple_get_nth_field(row,
+					dict_col_get_no(ifield->col));
+
+		dfield_copy(dst, src);
+
+		/* If a column prefix index, take only the prefix of a
+		value which is not SQL NULL */
+
+		if (ifield->prefix_len > 0
+		    && dfield_get_len(src) != UNIV_SQL_NULL) {
+
+			stored_len = dtype_get_at_most_n_mbchars(
+				dict_col_get_type(
+					dict_field_get_col(ifield)),
+				ifield->prefix_len,
+				dfield_get_len(src), src->data);
+
+			dfield_set_len(dst, stored_len);
+		}
+	}
+
+	ut_ad(dtuple_check_typed(entry));
+
+	return(entry);
+}
+
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index. */
+
+dtuple_t*
+row_build(
+/*======*/
+				/* out, own: row built; see the NOTE below! */
+	ulint		type,	/* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or
+				ROW_COPY_ALSO_EXTERNALS,
+				the two last copy also the data fields to
+				heap as the first only places pointers to
+				data fields on the index page, and thus is
+				more efficient */
+	dict_index_t*	index,	/* in: clustered index */
+	rec_t*		rec,	/* in: record in the clustered index;
+				NOTE: in the case ROW_COPY_POINTERS
+				the data fields in the row will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the row dtuple is used! */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index)
+				or NULL, in which case this function
+				will invoke rec_get_offsets() */
+	mem_heap_t*	heap)	/* in: memory heap from which the memory
+				needed is allocated */
+{
+	dtuple_t*	row;
+	dict_table_t*	table;
+	dict_field_t*	ind_field;
+	dict_col_t*	col;
+	dfield_t*	dfield;
+	ulint		n_fields;
+	byte*		field;
+	ulint		len;
+	ulint		row_len;	/* number of columns in the table */
+	byte*		buf; 
+	ulint		i;
+	mem_heap_t*	tmp_heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_ad(index && rec && heap);
+	ut_ad(index->type & DICT_CLUSTERED);
+
+	if (!offsets) {
+		offsets = rec_get_offsets(rec, index, offsets_,
+					ULINT_UNDEFINED, &tmp_heap);
+	} else {
+		ut_ad(rec_offs_validate(rec, index, offsets));
+	}
+
+	if (type != ROW_COPY_POINTERS) {
+		/* Take a copy of rec to heap */
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		rec = rec_copy(buf, rec, offsets);
+		/* Avoid a debug assertion in rec_offs_validate();
+		NOTE that the const qualifier of offsets is cast away */
+		rec_offs_make_valid(rec, index, (ulint*) offsets);
+	}
+
+	table = index->table;
+	row_len = dict_table_get_n_cols(table);
+
+	row = dtuple_create(heap, row_len);
+
+	dtuple_set_info_bits(row, rec_get_info_bits(rec, table->comp));
+
+	n_fields = rec_offs_n_fields(offsets);
+
+	dict_table_copy_types(row, table);
+
+	for (i = 0; i < n_fields; i++) {
+	        ind_field = dict_index_get_nth_field(index, i);
+		/* A column prefix field is not put to the row */
+		if (ind_field->prefix_len == 0) {
+
+		        col = dict_field_get_col(ind_field);
+			dfield = dtuple_get_nth_field(row,
+						dict_col_get_no(col));
+			field = rec_get_nth_field(rec, offsets, i, &len);
+
+			if (type == ROW_COPY_ALSO_EXTERNALS
+			    && rec_offs_nth_extern(offsets, i)) {
+				/* field and len are replaced here */
+			        field = btr_rec_copy_externally_stored_field(
+						rec, offsets, i, &len, heap);
+			}
+
+			dfield_set_data(dfield, field, len);
+		}
+	}
+
+	ut_ad(dtuple_check_typed(row));
+	/* tmp_heap is non-NULL only if rec_get_offsets() above set it */
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	return(row);
+}
+
+/***********************************************************************
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap. */
+
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+				/* out, own: index entry built; see the
+				NOTE below! */
+	ulint		type,	/* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+				the former copies also the data fields to
+				heap as the latter only places pointers to
+				data fields on the index page */
+	dict_index_t*	index,	/* in: index */
+	rec_t*		rec,	/* in: record in the index;
+				NOTE: in the case ROW_COPY_POINTERS
+				the data fields in the row will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the dtuple is used! */
+	mem_heap_t*	heap)	/* in: memory heap from which the memory
+				needed is allocated */
+{
+	dtuple_t*	entry;
+	dfield_t*	dfield;
+	ulint		i;
+	byte*		field;
+	ulint		len;
+	ulint		rec_len;	/* number of fields in rec */
+	byte*		buf;
+	mem_heap_t*	tmp_heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_ad(rec && heap && index);
+	
+	offsets = rec_get_offsets(rec, index, offsets,
+					ULINT_UNDEFINED, &tmp_heap);
+
+	if (type == ROW_COPY_DATA) {
+		/* Take a copy of rec to heap */
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		rec = rec_copy(buf, rec, offsets);
+		/* Avoid a debug assertion in rec_offs_validate(). */
+		rec_offs_make_valid(rec, index, offsets);
+	}
+
+	rec_len = rec_offs_n_fields(offsets);
+	
+	entry = dtuple_create(heap, rec_len);
+
+	dtuple_set_n_fields_cmp(entry,
+				dict_index_get_n_unique_in_tree(index));
+	ut_ad(rec_len == dict_index_get_n_fields(index));
+
+	dict_index_copy_types(entry, index, rec_len);
+
+	dtuple_set_info_bits(entry,
+			rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+	for (i = 0; i < rec_len; i++) {
+		/* The tuple field is set to the field data within rec */
+		dfield = dtuple_get_nth_field(entry, i);
+		field = rec_get_nth_field(rec, offsets, i, &len);
+
+		dfield_set_data(dfield, field, len);
+	}
+
+	ut_ad(dtuple_check_typed(entry));
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	return(entry);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+				/* out, own: row reference built; see the
+				NOTE below! */
+	ulint		type,	/* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+				the former copies also the data fields to
+				heap, whereas the latter only places pointers
+				to data fields on the index page */
+	dict_index_t*	index,	/* in: index */
+	rec_t*		rec,	/* in: record in the index;
+				NOTE: in the case ROW_COPY_POINTERS
+				the data fields in the row will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the row reference is used! */
+	mem_heap_t*	heap)	/* in: memory heap from which the memory
+				needed is allocated */
+{
+	dict_table_t*	table;
+	dict_index_t*	clust_index;
+	dfield_t*	dfield;
+	dtuple_t*	ref;
+	byte*		field;
+	ulint		len;
+	ulint		ref_len;	/* number of fields in ref */
+	ulint		pos;
+	byte*		buf;
+	ulint		clust_col_prefix_len;
+	ulint		i;
+	mem_heap_t*	tmp_heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_ad(index && rec && heap);
+
+	offsets = rec_get_offsets(rec, index, offsets,
+					ULINT_UNDEFINED, &tmp_heap);
+
+	if (type == ROW_COPY_DATA) {
+		/* Take a copy of rec to heap */
+
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+
+		rec = rec_copy(buf, rec, offsets);
+		/* Avoid a debug assertion in rec_offs_validate(). */
+		rec_offs_make_valid(rec, index, offsets);
+	}
+
+	table = index->table;
+	
+	clust_index = dict_table_get_first_index(table);
+
+	ref_len = dict_index_get_n_unique(clust_index);
+
+	ref = dtuple_create(heap, ref_len);
+
+	dict_index_copy_types(ref, clust_index, ref_len);
+
+	for (i = 0; i < ref_len; i++) {
+		dfield = dtuple_get_nth_field(ref, i);
+		/* pos: where field i of clust_index is found in index */
+		pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+		ut_a(pos != ULINT_UNDEFINED);
+	
+		field = rec_get_nth_field(rec, offsets, pos, &len);
+
+		dfield_set_data(dfield, field, len);
+
+		/* If the primary key contains a column prefix, then the
+		secondary index may contain a longer prefix of the same
+		column, or the full column, and we must adjust the length
+		accordingly. */
+
+		clust_col_prefix_len =
+			dict_index_get_nth_field(clust_index, i)->prefix_len;
+
+		if (clust_col_prefix_len > 0) {
+		    	if (len != UNIV_SQL_NULL) {
+
+				dfield_set_len(dfield,
+				  dtype_get_at_most_n_mbchars(
+					dfield_get_type(dfield),
+					clust_col_prefix_len, len, (char*) field));
+			}
+		}
+	}
+
+	ut_ad(dtuple_check_typed(ref));
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	return(ref);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. The tuple is supplied by the caller. */
+
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+	dtuple_t*	ref,	/* in/out: row reference built; see the
+				NOTE below! */
+	dict_index_t*	index,	/* in: index */
+	rec_t*		rec,	/* in: record in the index;
+				NOTE: the data fields in ref will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the row reference is used! */
+	trx_t*		trx)	/* in: transaction; used in error printing */
+{
+	dict_index_t*	clust_index;
+	dfield_t*	dfield;
+	byte*		field;
+	ulint		len;
+	ulint		ref_len;
+	ulint		pos;
+	ulint		clust_col_prefix_len;
+	ulint		i;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_a(ref && index && rec);
+	
+	if (!index->table) {
+		fputs("InnoDB: table ", stderr);
+	notfound:	/* also jumped to if the clustered index is missing */
+		ut_print_name(stderr, trx, index->table_name);
+		fputs(" for index ", stderr);
+		ut_print_name(stderr, trx, index->name);
+		fputs(" not found\n", stderr);
+		ut_error;
+	}
+	
+	clust_index = dict_table_get_first_index(index->table);
+	
+	if (!clust_index) {
+		fputs("InnoDB: clust index for table ", stderr);
+		goto notfound;
+	}
+
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	ref_len = dict_index_get_n_unique(clust_index);
+
+	ut_ad(ref_len == dtuple_get_n_fields(ref));
+	
+	dict_index_copy_types(ref, clust_index, ref_len);
+
+	for (i = 0; i < ref_len; i++) {
+		dfield = dtuple_get_nth_field(ref, i);
+		/* pos: where field i of clust_index is found in index */
+		pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+		ut_a(pos != ULINT_UNDEFINED);
+
+		field = rec_get_nth_field(rec, offsets, pos, &len);
+
+		dfield_set_data(dfield, field, len);
+
+		/* If the primary key contains a column prefix, then the
+		secondary index may contain a longer prefix of the same
+		column, or the full column, and we must adjust the length
+		accordingly. */
+
+		clust_col_prefix_len =
+			dict_index_get_nth_field(clust_index, i)->prefix_len;
+
+		if (clust_col_prefix_len > 0) {
+		    	if (len != UNIV_SQL_NULL) {
+
+				dfield_set_len(dfield,
+				  dtype_get_at_most_n_mbchars(
+					dfield_get_type(dfield),
+					clust_col_prefix_len, len, (char*) field));
+			}
+		}
+	}
+
+	ut_ad(dtuple_check_typed(ref));
+	if (heap) {
+		mem_heap_free(heap);
+	}
+}
+
+/***********************************************************************
+Builds, from a row, the row reference with which the clustered index record
+can be searched. */
+
+void
+row_build_row_ref_from_row(
+/*=======================*/
+	dtuple_t*	ref,	/* in/out: row reference built; see the
+				NOTE below! ref must have the right number
+				of fields! */
+	dict_table_t*	table,	/* in: table */
+	dtuple_t*	row)	/* in: row
+				NOTE: the data fields in ref will point
+				directly into data of this row */
+{
+	dict_index_t*	clust_index;
+	dict_field_t*	key_field;
+	dict_col_t*	key_col;
+	dfield_t*	ref_field;
+	ulint		ref_len;
+	ulint		i;
+
+	ut_ad(ref && table && row);
+
+	clust_index = dict_table_get_first_index(table);
+
+	ref_len = dict_index_get_n_unique(clust_index);
+
+	ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+	for (i = 0; i < ref_len; i++) {
+		key_field = dict_index_get_nth_field(clust_index, i);
+		key_col = dict_field_get_col(key_field);
+
+		/* Copy to the ref field the row field of the same column */
+
+		ref_field = dtuple_get_nth_field(ref, i);
+
+		dfield_copy(ref_field,
+			dtuple_get_nth_field(row, dict_col_get_no(key_col)));
+
+		if (key_field->prefix_len > 0
+		    && ref_field->len != UNIV_SQL_NULL) {
+
+			/* A column prefix in the clustered index key:
+			cut the length of the copy to the prefix */
+
+			ref_field->len = dtype_get_at_most_n_mbchars(
+				dict_col_get_type(
+					dict_field_get_col(key_field)),
+				key_field->prefix_len,
+				ref_field->len, ref_field->data);
+		}
+	}
+
+	ut_ad(dtuple_check_typed(ref));
+}
+
+/*******************************************************************
+Searches the clustered index record for a row, if we have the row
+reference. The cursor is opened with the search mode PAGE_CUR_LE. */
+
+ibool
+row_search_on_row_ref(
+/*==================*/
+				/* out: TRUE if found */
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor, which must
+				be closed by the caller */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF, ... */
+	dict_table_t*	table,	/* in: table */
+	dtuple_t*	ref,	/* in: row reference */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	dict_index_t*	index;
+	rec_t*		cur_rec;
+	page_t*		cur_page;
+	ulint		n_matched;
+
+	ut_ad(dtuple_check_typed(ref));
+
+	index = dict_table_get_first_index(table);
+
+	ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index));
+
+	btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+	n_matched = btr_pcur_get_low_match(pcur);
+
+	cur_rec = btr_pcur_get_rec(pcur);
+	cur_page = buf_frame_align(cur_rec);
+
+	if (cur_rec == page_get_infimum_rec(cur_page)) {
+		/* The cursor is on the page infimum: not found */
+
+		return(FALSE);
+	}
+
+	/* Found only if the low match of the cursor equals the number
+	of fields in the row reference */
+
+	return(n_matched == dtuple_get_n_fields(ref));
+}
+
+/*************************************************************************
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved. */
+
+rec_t*
+row_get_clust_rec(
+/*==============*/
+				/* out: record or NULL, if no record found */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF, ... */
+	rec_t*		rec,	/* in: record in a secondary index */
+	dict_index_t*	index,	/* in: secondary index */
+	dict_index_t**	clust_index,/* out: clustered index */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	dict_table_t*	sec_table;
+	mem_heap_t*	ref_heap;
+	dtuple_t*	row_ref;
+	btr_pcur_t	pcur;
+	rec_t*		result	= NULL;
+
+	ut_ad((index->type & DICT_CLUSTERED) == 0);
+
+	sec_table = index->table;
+
+	/* The row reference is built with ROW_COPY_POINTERS into a heap
+	which is freed before returning */
+
+	ref_heap = mem_heap_create(256);
+	row_ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, ref_heap);
+
+	if (row_search_on_row_ref(&pcur, mode, sec_table, row_ref, mtr)) {
+		result = btr_pcur_get_rec(&pcur);
+	}
+
+	mem_heap_free(ref_heap);
+	btr_pcur_close(&pcur);
+
+	*clust_index = dict_table_get_first_index(sec_table);
+
+	return(result);
+}
+
+/*******************************************************************
+Searches an index record: positions pcur with the search mode PAGE_CUR_LE
+and checks if the record under the cursor matches entry. */
+
+ibool
+row_search_index_entry(
+/*===================*/
+				/* out: TRUE if found */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF, ... */
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor, which must
+				be closed by the caller */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	rec_t*	cur_rec;
+	page_t*	cur_page;
+	ulint	n_matched;
+	ulint	n_wanted;
+	ibool	found	= FALSE;
+
+	ut_ad(dtuple_check_typed(entry));
+
+	btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+
+	n_matched = btr_pcur_get_low_match(pcur);
+	n_wanted = dtuple_get_n_fields(entry);
+
+	cur_rec = btr_pcur_get_rec(pcur);
+	cur_page = buf_frame_align(cur_rec);
+
+	/* Found, if the cursor is not on the page infimum and the low
+	match of the cursor equals the number of fields in entry */
+
+	if (cur_rec != page_get_infimum_rec(cur_page)
+	    && n_matched == n_wanted) {
+
+		found = TRUE;
+	}
+
+	return(found);
+}
diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.c
new file mode 100644
index 00000000000..94cf82d6a3d
--- /dev/null
+++ b/storage/innobase/row/row0sel.c
@@ -0,0 +1,4066 @@
+/*******************************************************
+Select
+
+(c) 1997 Innobase Oy
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "read0read.h"
+#include "buf0lru.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter.
+This is also the number of slots allocated in each column prefetch buffer. */
+#define SEL_MAX_N_PREFETCH	16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT	1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT	100
+
+/* Flags for search shortcut: the return values of
+row_sel_try_search_shortcut(). In row_sel(), SEL_FOUND makes the select
+proceed to the next table, SEL_EXHAUSTED makes it treat the table as
+exhausted, and SEL_RETRY makes it reset the plan cursor, restart the mtr
+and go on with an ordinary cursor search. */
+#define SEL_FOUND	0
+#define	SEL_EXHAUSTED	1
+#define SEL_RETRY	2
+
+/************************************************************************
+Checks whether a secondary index record agrees with a clustered index
+record on the user-defined ordering columns of the secondary index. For a
+column prefix field only the corresponding prefix of the clustered index
+column takes part in the comparison.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation! */
+static
+ibool
+row_sel_sec_rec_is_for_clust_rec(
+/*=============================*/
+					/* out: TRUE if the secondary
+					record is equal to the corresponding
+					fields in the clustered record,
+					when compared with collation */
+	rec_t*		sec_rec,	/* in: secondary index record */
+	dict_index_t*	sec_index,	/* in: secondary index */
+	rec_t*		clust_rec,	/* in: clustered index record */
+	dict_index_t*	clust_index)	/* in: clustered index */
+{
+	mem_heap_t*	heap		= NULL;
+	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
+	ulint*		clust_offs;
+	ulint*		sec_offs;
+	ulint		n_user_fields;
+	ulint		pos;
+	ibool		is_equal	= TRUE;
+
+	/* Store the element count of each stack array in its first slot
+	before passing it to rec_get_offsets() */
+	clust_offsets_[0] = (sizeof clust_offsets_) / sizeof clust_offsets_[0];
+	sec_offsets_[0] = (sizeof sec_offsets_) / sizeof sec_offsets_[0];
+
+	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offsets_,
+						ULINT_UNDEFINED, &heap);
+	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offsets_,
+						ULINT_UNDEFINED, &heap);
+
+	n_user_fields = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+	for (pos = 0; pos < n_user_fields; pos++) {
+		dict_field_t*	ifield;
+		dict_col_t*	col;
+		byte*		clust_field;
+		byte*		sec_field;
+		ulint		clust_len;
+		ulint		sec_len;
+
+		ifield = dict_index_get_nth_field(sec_index, pos);
+		col = dict_field_get_col(ifield);
+
+		clust_field = rec_get_nth_field(clust_rec, clust_offs,
+						dict_col_get_clust_pos(col),
+						&clust_len);
+		sec_field = rec_get_nth_field(sec_rec, sec_offs, pos,
+						&sec_len);
+
+		if (ifield->prefix_len > 0 && clust_len != UNIV_SQL_NULL) {
+			/* Column prefix field: limit the length of the
+			non-NULL clustered value to the prefix */
+
+			clust_len = dtype_get_at_most_n_mbchars(
+				dict_col_get_type(col),
+				ifield->prefix_len,
+				clust_len, (char*) clust_field);
+		}
+
+		if (cmp_data_data(dict_col_get_type(col),
+					clust_field, clust_len,
+					sec_field, sec_len) != 0) {
+			/* The first differing field decides the result */
+			is_equal = FALSE;
+
+			break;
+		}
+	}
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+
+	return(is_equal);
+}
+
+/*************************************************************************
+Creates a select node struct in the given heap. The new node is in the
+SEL_NODE_OPEN state, will not do an update, uses the BTR_SEARCH_LEAF latch
+mode and has no plans yet. */
+
+sel_node_t*
+sel_node_create(
+/*============*/
+				/* out, own: select node struct */
+	mem_heap_t*	heap)	/* in: memory heap where created */
+{
+	sel_node_t*	sel;
+
+	sel = mem_heap_alloc(heap, sizeof(sel_node_t));
+
+	sel->common.type = QUE_NODE_SELECT;
+
+	sel->plans = NULL;
+	sel->state = SEL_NODE_OPEN;
+	sel->latch_mode = BTR_SEARCH_LEAF;
+	sel->select_will_do_update = FALSE;
+
+	return(sel);
+}
+
+/*************************************************************************
+Frees the memory private to a select node when a query graph is freed:
+closes both persistent cursors of every table plan and frees the old
+version heap of the plan, if one exists. Does not free the heap where the
+node was originally created. */
+
+void
+sel_node_free_private(
+/*==================*/
+	sel_node_t*	node)	/* in: select node struct */
+{
+	ulint	table_no;
+
+	if (node->plans == NULL) {
+		/* No plans: nothing private to release */
+
+		return;
+	}
+
+	for (table_no = 0; table_no < node->n_tables; table_no++) {
+		plan_t*	plan = sel_node_get_nth_plan(node, table_no);
+
+		btr_pcur_close(&(plan->pcur));
+		btr_pcur_close(&(plan->clust_pcur));
+
+		if (plan->old_vers_heap) {
+			mem_heap_free(plan->old_vers_heap);
+		}
+	}
+}
+
+/*************************************************************************
+Evaluates, in list order, every expression in the select list of a select
+node. If there are aggregate functions, their argument value is added to
+the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+	sel_node_t*	node)	/* in: select node */
+{
+	que_node_t*	item;
+
+	for (item = node->select_list; item; item = que_node_get_next(item)) {
+
+		eval_exp(item);
+	}
+}
+
+/*************************************************************************
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ...: the i'th select list expression is copied to the alias
+of the i'th variable. Does nothing if there are no into-variables. */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+	sym_node_t*	var,	/* in: first variable in a list of variables */
+	sel_node_t*	node)	/* in: select node */
+{
+	que_node_t*	item;
+
+	if (var == NULL) {
+
+		return;
+	}
+
+	/* Walk the variable list and the select list in step */
+	for (item = node->select_list;
+	     var != NULL;
+	     item = que_node_get_next(item), var = que_node_get_next(var)) {
+
+		ut_ad(item);
+
+		eval_node_copy_val(var->alias, item);
+	}
+}
+
+/*************************************************************************
+Resets the aggregate value totals in the select list of an aggregate type
+query: sets the integer value of every select list node to 0 and clears
+the aggregate_already_fetched flag of the select node. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+	sel_node_t*	node)	/* in: select node */
+{
+	func_node_t*	aggregate;
+
+	ut_ad(node->is_aggregate);
+
+	for (aggregate = node->select_list;
+	     aggregate;
+	     aggregate = que_node_get_next(aggregate)) {
+
+		eval_node_set_int_val(aggregate, 0);
+	}
+
+	node->aggregate_already_fetched = FALSE;
+}
+
+/*************************************************************************
+Copies the input variable values when an explicit cursor is opened: every
+node in node->copy_variables gets a copy of the value of its alias, and its
+indirection is reset to NULL. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+	sel_node_t*	node)	/* in: select node */
+{
+	sym_node_t*	input_var;
+
+	for (input_var = UT_LIST_GET_FIRST(node->copy_variables);
+	     input_var;
+	     input_var = UT_LIST_GET_NEXT(col_var_list, input_var)) {
+
+		eval_node_copy_val(input_var, input_var->alias);
+
+		input_var->indirection = NULL;
+	}
+}
+
+/*************************************************************************
+Fetches the column values from a record. For each column in the list which
+has a field number defined for this type of index (clustered or secondary),
+the field is either copied to memory owned by the column node (if
+column->copy_val is set) or the column value is set to point to the field
+in the record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+	dict_index_t*	index,	/* in: record index */
+	rec_t*		rec,	/* in: record in a clustered or non-clustered
+				index */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	sym_node_t*	column)	/* in: first column in a column list, or
+				NULL */
+{
+	sym_node_t*	col_node;
+	ulint		field_no_slot;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	/* Each column node stores a separate field number for the
+	clustered and for the secondary index */
+	field_no_slot = (index->type & DICT_CLUSTERED)
+		? SYM_CLUST_FIELD_NO
+		: SYM_SEC_FIELD_NO;
+
+	for (col_node = column;
+	     col_node;
+	     col_node = UT_LIST_GET_NEXT(col_var_list, col_node)) {
+
+		ulint	field_no = col_node->field_nos[field_no_slot];
+		byte*	data;
+		ulint	len;
+
+		if (field_no == ULINT_UNDEFINED) {
+			/* No field number for this index: skip */
+
+			continue;
+		}
+
+		data = rec_get_nth_field(rec, offsets, field_no, &len);
+
+		if (col_node->copy_val) {
+			eval_node_copy_and_alloc_val(col_node, data, len);
+		} else {
+			dfield_set_data(que_node_get_val(col_node),
+								data, len);
+		}
+	}
+}
+
+/*************************************************************************
+Allocates a prefetch buffer for a column when prefetch is first time done.
+The buffer has SEL_MAX_N_PREFETCH slots, each of which starts out with no
+data and a zero value buffer size. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+	sym_node_t*	column)	/* in: symbol table node for a column */
+{
+	sel_buf_t*	slots;
+	ulint		slot_no;
+
+	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+	slots = mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t));
+
+	column->prefetch_buf = slots;
+
+	for (slot_no = 0; slot_no < SEL_MAX_N_PREFETCH; slot_no++) {
+		slots[slot_no].data = NULL;
+		slots[slot_no].val_buf_size = 0;
+	}
+}
+
+/*************************************************************************
+Frees the dynamically allocated memory for the data stored in the slots of
+a column prefetch buffer: the data of every slot with a nonzero value
+buffer size is freed.
+NOTE(review): the slot array itself, which sel_col_prefetch_buf_alloc()
+obtains with mem_alloc(), is not released here; confirm that the caller
+frees it, as otherwise it is leaked. */
+
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+	sel_buf_t*	prefetch_buf)	/* in, own: prefetch buffer */
+{
+	ulint	slot_no;
+
+	for (slot_no = 0; slot_no < SEL_MAX_N_PREFETCH; slot_no++) {
+		sel_buf_t*	slot = &prefetch_buf[slot_no];
+
+		if (slot->val_buf_size > 0) {
+
+			mem_free(slot->data);
+		}
+	}
+}
+
+/*************************************************************************
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them to the val fields in the column nodes. The row is
+taken from slot plan->first_prefetched, after which that index is advanced
+and the count of prefetched rows is decremented. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+	plan_t*	plan)	/* in: plan node for a table */
+{
+	sym_node_t*	column;
+
+	ut_ad(plan->n_rows_prefetched > 0);
+
+	for (column = UT_LIST_GET_FIRST(plan->columns);
+	     column;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+		dfield_t*	val = que_node_get_val(column);
+		sel_buf_t*	slot;
+		sel_buf_t	cached;
+
+		if (!column->copy_val) {
+			/* We did not really push any value for the
+			column */
+
+			ut_ad(!column->prefetch_buf);
+			ut_ad(que_node_get_val_buf_size(column) == 0);
+#ifdef UNIV_DEBUG
+			dfield_set_data(val, NULL, 0);
+#endif
+			continue;
+		}
+
+		ut_ad(column->prefetch_buf);
+
+		slot = column->prefetch_buf + plan->first_prefetched;
+
+		/* Remember what the slot holds before it is overwritten */
+		cached = *slot;
+
+		/* We must keep track of the allocated memory for
+		column values to be able to free it later: therefore
+		the slot and val exchange their contents */
+
+		slot->data = dfield_get_data(val);
+		slot->len = dfield_get_len(val);
+		slot->val_buf_size = que_node_get_val_buf_size(column);
+
+		dfield_set_data(val, cached.data, cached.len);
+		que_node_set_val_buf_size(column, cached.val_buf_size);
+	}
+
+	plan->n_rows_prefetched--;
+	plan->first_prefetched++;
+}
+
+/*************************************************************************
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. The row goes to the slot
+whose index is the number of rows prefetched so far; a column prefetch
+buffer is allocated when a column first needs one. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+	plan_t*	plan)	/* in: plan node for a table */
+{
+	sym_node_t*	column;
+	ulint		pos	= plan->n_rows_prefetched;
+
+	if (pos == 0) {
+		plan->first_prefetched = 0;
+	} else {
+		/* We have the convention that pushing new rows starts only
+		after the prefetch stack has been emptied: */
+
+		ut_ad(plan->first_prefetched == 0);
+	}
+
+	plan->n_rows_prefetched++;
+
+	ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+	for (column = UT_LIST_GET_FIRST(plan->columns);
+	     column;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+		dfield_t*	val;
+		sel_buf_t*	slot;
+		sel_buf_t	previous;
+
+		if (!column->copy_val) {
+			/* There is no sense to push pointers to database
+			page fields when we do not keep latch on the page! */
+
+			continue;
+		}
+
+		if (!column->prefetch_buf) {
+			/* Allocate a new prefetch buffer */
+
+			sel_col_prefetch_buf_alloc(column);
+		}
+
+		slot = column->prefetch_buf + pos;
+		val = que_node_get_val(column);
+
+		/* Remember what the slot holds before it is overwritten */
+		previous = *slot;
+
+		/* We must keep track of the allocated memory for
+		column values to be able to free it later: therefore
+		the slot and val exchange their contents */
+
+		slot->data = dfield_get_data(val);
+		slot->len = dfield_get_len(val);
+		slot->val_buf_size = que_node_get_val_buf_size(column);
+
+		dfield_set_data(val, previous.data, previous.len);
+		que_node_set_val_buf_size(column, previous.val_buf_size);
+	}
+}
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read.
+The version is built in plan->old_vers_heap, which is created on first use
+and emptied on every later call. */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+					/* out: DB_SUCCESS or error code */
+	read_view_t*	read_view,	/* in: read view */
+	plan_t*		plan,		/* in: plan node for table */
+	rec_t*		rec,		/* in: record in a clustered index */
+	ulint**		offsets,	/* in/out: offsets returned by
+					rec_get_offsets(rec, plan->index) */
+	mem_heap_t**	offset_heap,	/* in/out: memory heap from which
+					the offsets are allocated */
+	rec_t**		old_vers,	/* out: old version, or NULL if the
+					record does not exist in the view:
+					i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	if (plan->old_vers_heap == NULL) {
+		plan->old_vers_heap = mem_heap_create(512);
+	} else {
+		mem_heap_empty(plan->old_vers_heap);
+	}
+
+	return(row_vers_build_for_consistent_read(rec, mtr, plan->index,
+					offsets, read_view, offset_heap,
+					plan->old_vers_heap, old_vers));
+}
+
+/*************************************************************************
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted. Evaluation stops at the first condition which
+does not hold. */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ 			/* out: TRUE if row passed the tests */
+	plan_t*	plan)	/* in: plan for the table; the column values must
+			already have been retrieved and the right sides of
+			comparisons evaluated */
+{
+	func_node_t*	end_cond;
+
+	/* All conditions in end_conds are comparisons of a column to an
+	expression */
+
+	for (end_cond = UT_LIST_GET_FIRST(plan->end_conds);
+	     end_cond;
+	     end_cond = UT_LIST_GET_NEXT(cond_list, end_cond)) {
+
+		/* Evaluate the left side of the comparison, i.e., get the
+		column value if there is an indirection */
+
+		eval_sym(end_cond->args);
+
+		/* Do the comparison */
+
+		if (!eval_cmp(end_cond)) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************************
+Tests the other conditions: evaluates the conditions in plan->other_conds
+in list order and stops at the first one whose value is false. */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+			/* out: TRUE if row passed the tests */
+	plan_t*	plan)	/* in: plan for the table; the column values must
+			already have been retrieved */
+{
+	func_node_t*	other_cond;
+
+	for (other_cond = UT_LIST_GET_FIRST(plan->other_conds);
+	     other_cond;
+	     other_cond = UT_LIST_GET_NEXT(cond_list, other_cond)) {
+
+		eval_exp(other_cond);
+
+		if (!eval_node_get_ibool_val(other_cond)) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking: if node->read_view is not
+set, a read lock is requested on the clustered index record; otherwise this
+is a consistent read, no lock is requested, and an older version of the
+clustered record is built if the read view does not see the current one.
+When a record is returned in *out_rec, the columns in plan->columns have
+been fetched from it. */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+				/* out: DB_SUCCESS or error code */
+	sel_node_t*	node,	/* in: select_node */
+	plan_t*		plan,	/* in: plan node for table */
+	rec_t*		rec,	/* in: record in a non-clustered index */
+	que_thr_t*	thr,	/* in: query thread */
+	rec_t**		out_rec,/* out: clustered record or an old version of
+				it, NULL if the old version did not exist
+				in the read view, i.e., it was a fresh
+				inserted version; not set if an error code
+				is returned */
+	mtr_t*		mtr)	/* in: mtr used to get access to the
+				non-clustered record; the same mtr is used to
+				access the clustered index */
+{
+	dict_index_t*	index;
+	rec_t*		clust_rec;
+	rec_t*		old_vers;
+	ulint		err;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	/* Store the element count of the stack array in its first slot */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	/* First the offsets of the secondary index record are computed:
+	they are needed for building the clustered index reference */
+	offsets = rec_get_offsets(rec,
+				btr_pcur_get_btr_cur(&plan->pcur)->index,
+				offsets, ULINT_UNDEFINED, &heap);
+	
+	row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+	index = dict_table_get_first_index(plan->table);
+	
+	/* Position plan->clust_pcur in the clustered index with the
+	reference built above */
+	btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+				node->latch_mode, &(plan->clust_pcur),
+				0, mtr);
+
+	clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+	/* Note: only if the search ends up on a non-infimum record is the
+	low_match value the real match to the search tuple */
+
+	if (!page_rec_is_user_rec(clust_rec)
+            || btr_pcur_get_low_match(&(plan->clust_pcur))
+	       < dict_index_get_n_unique(index)) {
+	
+		/* No matching clustered record: this is accepted only for
+		a delete-marked secondary record in a consistent read */
+		ut_a(rec_get_deleted_flag(rec, plan->table->comp));
+		ut_a(node->read_view);
+
+		/* In a rare case it is possible that no clust rec is found
+		for a delete-marked secondary index record: if in row0umod.c
+		in row_undo_mod_remove_clust_low() we have already removed
+		the clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case we know that the
+		clustered index record did not exist in the read view of
+		trx. */
+
+		clust_rec = NULL;
+
+		goto func_exit;
+	}
+
+	/* From here on offsets describe the clustered index record */
+	offsets = rec_get_offsets(clust_rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+	if (!node->read_view) {
+		/* Try to place a lock on the index record */
+
+		/* If innodb_locks_unsafe_for_binlog option is used,
+		we lock only the record, i.e., next-key locking is
+		not used. */
+		ulint	lock_type;
+
+		if (srv_locks_unsafe_for_binlog) {
+			lock_type = LOCK_REC_NOT_GAP;
+		} else {
+			lock_type = LOCK_ORDINARY;
+		}
+
+		err = lock_clust_rec_read_check_and_lock(0,
+				clust_rec, index, offsets,
+				node->row_lock_mode, lock_type, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto err_exit;
+		}
+	} else {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		old_vers = NULL;
+
+		if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+							node->read_view)) {
+
+			err = row_sel_build_prev_vers(node->read_view, plan,
+						clust_rec, &offsets, &heap,
+						&old_vers, mtr);
+			if (err != DB_SUCCESS) {
+
+				goto err_exit;
+			}
+
+			clust_rec = old_vers;
+
+			if (clust_rec == NULL) {
+				goto func_exit;
+			}
+		}
+
+		/* If we had to go to an earlier version of row or the
+		secondary index record is delete marked, then it may be that
+		the secondary index record corresponding to clust_rec
+		(or old_vers) is not rec; in that case we must ignore
+		such row because in our snapshot rec would not have existed.
+		Remember that from rec we cannot see directly which transaction
+		id corresponds to it: we have to go to the clustered index
+		record. A query where we want to fetch all rows where
+		the secondary index value is in some interval would return
+		a wrong result if we would not drop rows which we come to
+		visit through secondary index records that would not really
+		exist in our snapshot. */
+		
+		if ((old_vers || rec_get_deleted_flag(rec, plan->table->comp))
+		    && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
+							clust_rec, index)) {
+			clust_rec = NULL;
+			goto func_exit;
+		}
+	}
+
+	/* Fetch the columns needed in test conditions */
+
+	row_sel_fetch_columns(index, clust_rec, offsets,
+					UT_LIST_GET_FIRST(plan->columns));
+func_exit:
+	/* Success: *out_rec may be NULL, see the parameter description */
+	*out_rec = clust_rec;
+	err = DB_SUCCESS;
+err_exit:
+	/* On the error path err is already set and *out_rec is not
+	assigned */
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/*************************************************************************
+Sets a lock on a record, using the clustered or the secondary index lock
+routine according to the type of the index. If the trx_locks list of the
+transaction has more than 10000 entries and
+buf_LRU_buf_pool_running_out() holds, no lock is requested and
+DB_LOCK_TABLE_FULL is returned. */
+UNIV_INLINE
+ulint
+sel_set_rec_lock(
+/*=============*/
+				/* out: DB_SUCCESS or error code */
+	rec_t*		rec,	/* in: record */
+	dict_index_t*	index,	/* in: index */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	ulint		mode,	/* in: lock mode */
+	ulint		type, 	/* in: LOCK_ORDINARY, LOCK_GAP, or
+				LOCK_REC_NOT_GAP */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	trx_t*	trx	= thr_get_trx(thr);
+
+	if (UT_LIST_GET_LEN(trx->trx_locks) > 10000
+	    && buf_LRU_buf_pool_running_out()) {
+
+		return(DB_LOCK_TABLE_FULL);
+	}
+
+	if (index->type & DICT_CLUSTERED) {
+
+		return(lock_clust_rec_read_check_and_lock(0,
+					rec, index, offsets, mode, type, thr));
+	}
+
+	return(lock_sec_rec_read_check_and_lock(0,
+					rec, index, offsets, mode, type, thr));
+}
+
+/*************************************************************************
+Opens a pcur to a table index. If the plan has a search tuple, the tuple is
+filled from the evaluated tuple expressions and the cursor is opened with
+it; otherwise the cursor is opened at the start or the end of the index,
+as chosen by plan->asc. Marks the cursor of the plan as open. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+	sel_node_t*	node,		/* in: select node */
+	plan_t*		plan,		/* in: table plan */
+	ibool		search_latch_locked,
+					/* in: TRUE if the thread currently
+					has the search latch locked in
+					s-mode */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	dict_index_t*	index		= plan->index;
+	func_node_t*	end_cond;
+	ulint		has_search_latch;	/* RW_S_LATCH or 0 */
+	ulint		n_fields;
+	ulint		field_no;
+
+	has_search_latch = search_latch_locked ? RW_S_LATCH : 0;
+
+	/* Calculate the value of the search tuple: the exact match columns
+	get their expressions evaluated when we evaluate the right sides of
+	end_conds */
+
+	for (end_cond = UT_LIST_GET_FIRST(plan->end_conds);
+	     end_cond;
+	     end_cond = UT_LIST_GET_NEXT(cond_list, end_cond)) {
+
+		eval_exp(que_node_get_next(end_cond->args));
+	}
+
+	if (plan->tuple == NULL) {
+		/* Open the cursor to the start or the end of the index
+		(FALSE: no init) */
+
+		btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
+						&(plan->pcur), FALSE, mtr);
+	} else {
+		n_fields = dtuple_get_n_fields(plan->tuple);
+
+		if (plan->n_exact_match < n_fields) {
+			/* There is a non-exact match field which must be
+			evaluated separately */
+
+			eval_exp(plan->tuple_exps[n_fields - 1]);
+		}
+
+		/* Copy the expression values to the search tuple */
+
+		for (field_no = 0; field_no < n_fields; field_no++) {
+			dfield_copy_data(
+				dtuple_get_nth_field(plan->tuple, field_no),
+				que_node_get_val(plan->tuple_exps[field_no]));
+		}
+
+		/* Open pcur to the index */
+
+		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+					node->latch_mode, &(plan->pcur),
+					has_search_latch, mtr);
+	}
+
+	ut_ad(plan->n_rows_prefetched == 0);
+	ut_ad(plan->n_rows_fetched == 0);
+	ut_ad(plan->cursor_at_end == FALSE);
+
+	plan->pcur_is_open = TRUE;
+}
+
+/*************************************************************************
+Restores a stored pcur position to a table index and tells the caller
+whether the record the cursor ends up on still has to be processed. The
+decision depends on the scan direction (plan->asc), on the relative
+position which was stored in the cursor, and on whether the restored
+position equals the stored one. */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+				/* out: TRUE if the cursor should be moved to
+				the next record after we return from this
+				function (moved to the previous, in the case
+				of a descending cursor) without processing
+				again the current cursor record */
+	sel_node_t*	node,	/* in: select node */
+	plan_t*		plan,	/* in: table plan */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	ibool	equal_position;
+	ulint	relative_position;
+
+	ut_ad(!plan->cursor_at_end);
+	
+	/* The stored relative position is read before
+	btr_pcur_restore_position() is called */
+	relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+	equal_position = btr_pcur_restore_position(node->latch_mode,
+							&(plan->pcur), mtr);
+
+	/* If the cursor is traveling upwards, and relative_position is
+	
+	(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+	yet on the successor of the page infimum;
+	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+	first record GREATER than the predecessor of a page supremum; we have
+	not yet processed the cursor record: no need to move the cursor to the
+	next record;
+	(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+	last record LESS or EQUAL to the old stored user record; (a) if
+	equal_position is FALSE, this means that the cursor is now on a record
+	less than the old user record, and we must move to the next record;
+	(b) if equal_position is TRUE, then if
+	plan->stored_cursor_rec_processed is TRUE, we must move to the next
+	record, else there is no need to move the cursor. */
+
+	if (plan->asc) {
+		if (relative_position == BTR_PCUR_ON) {
+
+			if (equal_position) {
+
+				return(plan->stored_cursor_rec_processed);
+			}
+
+			return(TRUE);
+		}
+
+		/* Case (1) above is excluded by this debug assertion */
+		ut_ad(relative_position == BTR_PCUR_AFTER
+		      || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+		return(FALSE);
+	}
+
+	/* If the cursor is traveling downwards, and relative_position is
+	
+	(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+	the last record LESS than the successor of a page infimum; we have not
+	processed the cursor record: no need to move the cursor;
+	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+	first record GREATER than the predecessor of a page supremum; we have
+	processed the cursor record: we should move the cursor to the previous
+	record;
+	(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+	last record LESS or EQUAL to the old stored user record; (a) if
+	equal_position is FALSE, this means that the cursor is now on a record
+	less than the old user record, and we need not move to the previous
+	record; (b) if equal_position is TRUE, then if
+	plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+	record, else there is no need to move the cursor. */
+
+	if (relative_position == BTR_PCUR_BEFORE
+	    || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+		return(FALSE);
+	}
+
+	if (relative_position == BTR_PCUR_ON) {
+
+		if (equal_position) {
+
+			return(plan->stored_cursor_rec_processed);
+		}
+
+		return(FALSE);
+	}
+
+	/* Only the AFTER positions remain: case (2) of the descending scan */
+	ut_ad(relative_position == BTR_PCUR_AFTER
+		      || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+	return(TRUE);
+}
+
+/*************************************************************************
+Resets a plan cursor to a closed state: the cursor is marked as not open
+and not at the end, and the counts of fetched and prefetched rows are
+zeroed. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+	plan_t*	plan)	/* in: plan */
+{
+	/* Row counters */
+	plan->n_rows_prefetched = 0;
+	plan->n_rows_fetched = 0;
+
+	/* Cursor state flags */
+	plan->cursor_at_end = FALSE;
+	plan->pcur_is_open = FALSE;
+}
+	
+/*************************************************************************
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). The caller must hold the
+search latch in s-mode. The record is reported as found only if the read
+view sees it, it is not delete-marked, and it passes the other conditions
+of the plan; in that case plan->n_rows_fetched is incremented. */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+				/* out: SEL_FOUND if a qualifying record
+				was fetched; SEL_EXHAUSTED if no record can
+				qualify (no full match, the record is
+				delete-marked, or it fails the other
+				conditions); SEL_RETRY if the shortcut cannot
+				decide (cursor not on a user record, or the
+				record is not visible in the read view) and
+				an ordinary search must be made */
+	sel_node_t*	node,	/* in: select node for a consistent read */
+	plan_t*		plan,	/* in: plan for a unique search in clustered
+				index */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	dict_index_t*	index;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	ulint		ret;
+	/* Store the element count of the stack array in its first slot */
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	index = plan->index;
+
+	ut_ad(node->read_view);
+	ut_ad(plan->unique_search);
+	ut_ad(!plan->must_get_clust);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	row_sel_open_pcur(node, plan, TRUE, mtr);
+
+	rec = btr_pcur_get_rec(&(plan->pcur));
+
+	/* The two early returns below happen before rec_get_offsets() is
+	called, so heap is still NULL and there is nothing to free */
+
+	if (!page_rec_is_user_rec(rec)) {
+
+		return(SEL_RETRY);
+	}
+
+	ut_ad(plan->mode == PAGE_CUR_GE);
+
+	/* As the cursor is now placed on a user record after a search with
+	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+	fields in the user record matched to the search tuple */
+
+	if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+		return(SEL_EXHAUSTED);
+	}
+
+	/* This is a non-locking consistent read: if the read view does not
+	see the record, give up the shortcut and let the caller retry with
+	an ordinary search */
+
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	if (index->type & DICT_CLUSTERED) {
+		if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+							node->read_view)) {
+			ret = SEL_RETRY;
+			goto func_exit;
+		}
+	} else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) {
+
+		ret = SEL_RETRY;
+		goto func_exit;
+	}
+
+	/* Test deleted flag. Fetch the columns needed in test conditions. */
+
+	row_sel_fetch_columns(index, rec, offsets,
+				UT_LIST_GET_FIRST(plan->columns));
+
+	if (rec_get_deleted_flag(rec, plan->table->comp)) {
+
+		ret = SEL_EXHAUSTED;
+		goto func_exit;
+	}
+
+	/* Test the rest of search conditions */
+
+	if (!row_sel_test_other_conds(plan)) {
+
+		ret = SEL_EXHAUSTED;
+		goto func_exit;
+	}
+
+	ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+	plan->n_rows_fetched++;
+	ret = SEL_FOUND;
+func_exit:
+	if (heap) {
+		mem_heap_free(heap);
+	}
+
+	/* Return the status chosen above: returning SEL_FOUND
+	unconditionally here would report invisible, delete-marked and
+	non-qualifying records as found */
+	return(ret);
+}
+
+/*************************************************************************
+Performs a select step. */
+static
+ulint
+row_sel(
+/*====*/
+				/* out: DB_SUCCESS or error code */
+	sel_node_t*	node,	/* in: select node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_index_t*	index;
+	plan_t*		plan;
+	mtr_t		mtr;
+	ibool		moved;
+	rec_t*		rec;
+	rec_t*		old_vers;
+	rec_t*		clust_rec;
+	ibool		search_latch_locked;
+	ibool		consistent_read;
+	
+		/* The following flag becomes TRUE when we are doing a
+		consistent read from a non-clustered index and we must look
+		at the clustered index to find out the previous delete mark
+		state of the non-clustered record: */
+
+	ibool		cons_read_requires_clust_rec	= FALSE;
+	ulint		cost_counter			= 0;
+	ibool		cursor_just_opened;
+	ibool		must_go_to_next;
+	ibool		leaf_contains_updates 		= FALSE;
+					/* TRUE if select_will_do_update is
+					TRUE and the current clustered index
+					leaf page has been updated during
+					the current mtr: mtr must be committed
+					at the same time as the leaf x-latch
+					is released */
+	ibool		mtr_has_extra_clust_latch 	= FALSE;
+					/* TRUE if the search was made using
+					a non-clustered index, and we had to
+					access the clustered record: now &mtr
+					contains a clustered index latch, and
+					&mtr must be committed before we move
+					to the next non-clustered record */
+	ulint		found_flag;
+	ulint		err;
+	mem_heap_t*	heap				= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets				= offsets_;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_ad(thr->run_node == node);
+
+	search_latch_locked = FALSE;
+
+	if (node->read_view) {
+		/* In consistent reads, we try to do with the hash index and
+		not to use the buffer page get. This is to reduce memory bus
+		load resulting from semaphore operations. The search latch
+		will be s-locked when we access an index with a unique search
+		condition, but not locked when we access an index with a
+		less selective search condition. */
+
+		consistent_read = TRUE;
+	} else {
+		consistent_read = FALSE;
+	}
+
+table_loop:
+	/* TABLE LOOP
+	   ----------
+	This is the outer major loop in calculating a join. We come here when
+	node->fetch_table changes, and after adding a row to aggregate totals
+	and, of course, when this function is called. */
+
+	ut_ad(leaf_contains_updates == FALSE);
+	ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+	plan = sel_node_get_nth_plan(node, node->fetch_table);
+	index = plan->index;
+
+	if (plan->n_rows_prefetched > 0) {
+		sel_pop_prefetched_row(plan);
+
+		goto next_table_no_mtr;
+	}
+
+	if (plan->cursor_at_end) {
+		/* The cursor has already reached the result set end: no more
+		rows to process for this table cursor, as also the prefetch
+		stack was empty */
+
+		ut_ad(plan->pcur_is_open);
+
+		goto table_exhausted_no_mtr;
+	}
+
+	/* Open a cursor to index, or restore an open cursor position */
+	
+	mtr_start(&mtr);
+
+	if (consistent_read && plan->unique_search && !plan->pcur_is_open
+						&& !plan->must_get_clust) {
+		if (!search_latch_locked) {
+			rw_lock_s_lock(&btr_search_latch);
+
+			search_latch_locked = TRUE;
+		} else if (btr_search_latch.writer_is_wait_ex) {
+
+			/* There is an x-latch request waiting: release the
+			s-latch for a moment; as an s-latch here is often
+			kept for some 10 searches before being released,
+			a waiting x-latch request would block other threads
+			from acquiring an s-latch for a long time, lowering
+			performance significantly in multiprocessors. */
+
+			rw_lock_s_unlock(&btr_search_latch);
+			rw_lock_s_lock(&btr_search_latch);
+		}
+
+		found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+		if (found_flag == SEL_FOUND) {
+
+			goto next_table;
+
+		} else if (found_flag == SEL_EXHAUSTED) {
+
+			goto table_exhausted;
+		}
+		
+		ut_ad(found_flag == SEL_RETRY);
+
+		plan_reset_cursor(plan);
+
+		mtr_commit(&mtr);
+		mtr_start(&mtr);
+	}
+
+	if (search_latch_locked) {
+		rw_lock_s_unlock(&btr_search_latch);
+
+		search_latch_locked = FALSE;
+	}
+
+	if (!plan->pcur_is_open) {
+		/* Evaluate the expressions to build the search tuple and
+		open the cursor */
+
+		row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
+
+		cursor_just_opened = TRUE;
+
+		/* A new search was made: increment the cost counter */
+		cost_counter++;
+	} else {
+		/* Restore pcur position to the index */
+
+		must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
+
+		cursor_just_opened = FALSE;
+
+		if (must_go_to_next) {
+			/* We have already processed the cursor record: move
+			to the next */
+		
+			goto next_rec;
+		}
+	}
+	
+rec_loop:
+	/* RECORD LOOP
+	   -----------
+	In this loop we use pcur and try to fetch a qualifying row, and
+	also fill the prefetch buffer for this table if n_rows_fetched has
+	exceeded a threshold. While we are inside this loop, the following
+	holds:
+	(1) &mtr is started,
+	(2) pcur is positioned and open.
+
+	NOTE that if cursor_just_opened is TRUE here, it means that we came
+	to this point right after row_sel_open_pcur. */
+	
+	ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+	rec = btr_pcur_get_rec(&(plan->pcur));
+	
+	/* PHASE 1: Set a lock if specified */
+
+	if (!node->asc && cursor_just_opened
+		&& (rec != page_get_supremum_rec(buf_frame_align(rec)))) {
+
+		/* When we open a cursor for a descending search, we must set
+		a next-key lock on the successor record: otherwise it would
+		be possible to insert new records next to the cursor position,
+		and it might be that these new records should appear in the
+		search result set, resulting in the phantom problem. */
+		
+		if (!consistent_read) {
+
+			/* If innodb_locks_unsafe_for_binlog option is used,
+			we lock only the record, i.e., next-key locking is
+			not used. */
+
+			rec_t*	next_rec = page_rec_get_next(rec);
+			ulint	lock_type;
+			offsets = rec_get_offsets(next_rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+			if (srv_locks_unsafe_for_binlog) {
+				lock_type = LOCK_REC_NOT_GAP;
+			} else {
+				lock_type = LOCK_ORDINARY;
+			}
+
+			err = sel_set_rec_lock(next_rec, index, offsets,
+					node->row_lock_mode, lock_type, thr);
+
+			if (err != DB_SUCCESS) {
+				/* Note that in this case we will store in pcur
+				the PREDECESSOR of the record we are waiting
+				the lock for */
+				
+				goto lock_wait_or_error;
+			}
+		}
+	}
+
+	if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+		/* The infimum record on a page cannot be in the result set,
+		and neither can a record lock be placed on it: we skip such
+		a record. We also increment the cost counter as we may have
+		processed yet another page of index. */
+
+		cost_counter++;
+
+		goto next_rec;
+	}
+
+	if (!consistent_read) {
+		/* Try to place a lock on the index record */	
+
+		/* If innodb_locks_unsafe_for_binlog option is used,
+		we lock only the record, i.e., next-key locking is
+		not used. */
+
+		ulint	lock_type;
+		offsets = rec_get_offsets(rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+		if (srv_locks_unsafe_for_binlog) {
+			lock_type = LOCK_REC_NOT_GAP;
+		} else {
+			lock_type = LOCK_ORDINARY;
+		}
+
+		err = sel_set_rec_lock(rec, index, offsets,
+					node->row_lock_mode, lock_type, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+	}
+
+	if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+		/* A page supremum record cannot be in the result set: skip
+		it now when we have placed a possible lock on it */
+
+		goto next_rec;
+	}
+
+	ut_ad(page_rec_is_user_rec(rec));
+
+	if (cost_counter > SEL_COST_LIMIT) {
+		
+		/* Now that we have placed the necessary locks, we can stop
+		for a while and store the cursor position; NOTE that if we
+		would store the cursor position BEFORE placing a record lock,
+		it might happen that the cursor would jump over some records
+		that another transaction could meanwhile insert adjacent to
+		the cursor: this would result in the phantom problem. */
+
+		goto stop_for_a_while;
+	}
+	
+	/* PHASE 2: Check a mixed index mix id if needed */
+
+	if (plan->unique_search && cursor_just_opened) {
+
+		ut_ad(plan->mode == PAGE_CUR_GE);
+	
+		/* As the cursor is now placed on a user record after a search
+		with the mode PAGE_CUR_GE, the up_match field in the cursor
+		tells how many fields in the user record matched to the search
+		tuple */ 
+
+		if (btr_pcur_get_up_match(&(plan->pcur))
+						< plan->n_exact_match) {
+			goto table_exhausted;
+		}
+
+		/* Ok, no need to test end_conds or mix id */
+
+	} else if (plan->mixed_index) {
+	    	/* We have to check if the record in a mixed cluster belongs
+	    	to this table */
+
+	 	if (!dict_is_mixed_table_rec(plan->table, rec)) {
+
+	    		goto next_rec;
+	    	}
+	}
+
+	/* We are ready to look at a possible new index entry in the result
+	set: the cursor is now placed on a user record */
+
+	/* PHASE 3: Get previous version in a consistent read */
+
+	cons_read_requires_clust_rec = FALSE;
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	if (consistent_read) {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		if (index->type & DICT_CLUSTERED) {
+			
+			if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+							node->read_view)) {
+
+				err = row_sel_build_prev_vers(node->read_view,
+							plan, rec,
+							&offsets, &heap,
+							&old_vers, &mtr);
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+
+				if (old_vers == NULL) {
+					offsets = rec_get_offsets(
+						rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+					row_sel_fetch_columns(index, rec,
+					    offsets,
+					    UT_LIST_GET_FIRST(plan->columns));
+
+					if (!row_sel_test_end_conds(plan)) {
+
+						goto table_exhausted;
+					}
+
+					goto next_rec;
+				}
+
+				rec = old_vers;
+			}
+		} else if (!lock_sec_rec_cons_read_sees(rec, index,
+							node->read_view)) {
+			cons_read_requires_clust_rec = TRUE;
+		}
+	}
+
+	/* PHASE 4: Test search end conditions and deleted flag */
+
+	/* Fetch the columns needed in test conditions */
+	
+	row_sel_fetch_columns(index, rec, offsets,
+					UT_LIST_GET_FIRST(plan->columns));
+
+	/* Test the selection end conditions: these can only contain columns
+	which already are found in the index, even though the index might be
+	non-clustered */
+
+	if (plan->unique_search && cursor_just_opened) {
+
+		/* No test necessary: the test was already made above */
+
+	} else if (!row_sel_test_end_conds(plan)) {
+
+		goto table_exhausted;
+	}
+
+	if (rec_get_deleted_flag(rec, plan->table->comp)
+			&& !cons_read_requires_clust_rec) {
+
+		/* The record is delete marked: we can skip it if this is
+		not a consistent read which might see an earlier version
+		of a non-clustered index record */
+
+		if (plan->unique_search) {
+			
+			goto table_exhausted;
+		}
+		
+		goto next_rec;
+	}
+
+	/* PHASE 5: Get the clustered index record, if needed and if we did
+	not do the search using the clustered index */
+
+	if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+		/* It was a non-clustered index and we must fetch also the
+		clustered index record */
+
+		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+									&mtr);
+		mtr_has_extra_clust_latch = TRUE;
+		
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+
+		/* Retrieving the clustered record required a search:
+		increment the cost counter */
+
+		cost_counter++;
+
+		if (clust_rec == NULL) {
+			/* The record did not exist in the read view */
+			ut_ad(consistent_read);
+
+			goto next_rec;
+		}
+
+		if (rec_get_deleted_flag(clust_rec, plan->table->comp)) {
+
+			/* The record is delete marked: we can skip it */
+
+			goto next_rec;
+		}
+
+		if (node->can_get_updated) {
+
+			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+		}
+	}	
+
+	/* PHASE 6: Test the rest of search conditions */
+	
+	if (!row_sel_test_other_conds(plan)) {
+
+		if (plan->unique_search) {
+			
+			goto table_exhausted;
+		}
+
+		goto next_rec;
+	}
+
+	/* PHASE 7: We found a new qualifying row for the current table; push
+	the row if prefetch is on, or move to the next table in the join */
+	
+	plan->n_rows_fetched++;
+
+	ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+	if (node->select_will_do_update) {
+		/* This is a searched update and we can do the update in-place,
+		saving CPU time */
+
+		row_upd_in_place_in_select(node, thr, &mtr);
+
+		leaf_contains_updates = TRUE;
+
+		/* When the database is in the online backup mode, the number
+		of log records for a single mtr should be small: increment the
+		cost counter to ensure it */
+		
+		cost_counter += 1 + (SEL_COST_LIMIT / 8);
+
+		if (plan->unique_search) {
+
+			goto table_exhausted;			
+		}
+
+		goto next_rec;
+	}	
+
+	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+				|| plan->unique_search || plan->no_prefetch) {
+
+		/* No prefetch in operation: go to the next table */
+	
+		goto next_table;
+	}
+
+	sel_push_prefetched_row(plan);
+
+	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+		/* The prefetch buffer is now full */
+		
+		sel_pop_prefetched_row(plan);
+
+		goto next_table;
+	}
+
+next_rec:	
+	ut_ad(!search_latch_locked);
+
+	if (mtr_has_extra_clust_latch) {
+
+		/* We must commit &mtr if we are moving to the next
+		non-clustered index record, because we could break the
+		latching order if we would access a different clustered
+		index page right away without releasing the previous. */
+
+		goto commit_mtr_for_a_while;
+	}
+	
+	if (leaf_contains_updates
+		&& btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) {
+
+		/* We must commit &mtr if we are moving to a different page,
+		because we have done updates to the x-latched leaf page, and
+		the latch would be released in btr_pcur_move_to_next, without
+		&mtr getting committed there */
+
+		ut_ad(node->asc);
+
+		goto commit_mtr_for_a_while;
+	}
+
+	if (node->asc) {
+		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+	} else {
+		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+	}
+
+	if (!moved) {
+		
+		goto table_exhausted;
+	}
+
+	cursor_just_opened = FALSE;
+
+	/* END OF RECORD LOOP
+	   ------------------ */
+	goto rec_loop;
+
+next_table:
+	/* We found a record which satisfies the conditions: we can move to
+	the next table or return a row in the result set */
+
+	ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr));
+	
+	if (plan->unique_search && !node->can_get_updated) {
+
+		plan->cursor_at_end = TRUE;
+	} else {
+		ut_ad(!search_latch_locked);
+
+		plan->stored_cursor_rec_processed = TRUE;
+
+		btr_pcur_store_position(&(plan->pcur), &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	leaf_contains_updates = FALSE;
+	mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+	/* If we use 'goto' to this label, it means that the row was popped
+	from the prefetched rows stack, and &mtr is already committed */
+	
+	if (node->fetch_table + 1 == node->n_tables) {
+
+		sel_eval_select_list(node);
+
+		if (node->is_aggregate) {
+
+			goto table_loop;			
+		}
+
+		sel_assign_into_var_values(node->into_list, node);
+		
+		thr->run_node = que_node_get_parent(node);
+
+		if (search_latch_locked) {
+			rw_lock_s_unlock(&btr_search_latch);
+		}
+
+		err = DB_SUCCESS;
+		goto func_exit;
+	}
+
+	node->fetch_table++;
+
+	/* When we move to the next table, we first reset the plan cursor:
+	we do not care about resetting it when we backtrack from a table */
+	
+	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+	
+	goto table_loop;
+
+table_exhausted:
+	/* The table cursor pcur reached the result set end: backtrack to the
+	previous table in the join if we do not have cached prefetched rows */	
+
+	plan->cursor_at_end = TRUE;
+
+	mtr_commit(&mtr);
+
+	leaf_contains_updates = FALSE;
+	mtr_has_extra_clust_latch = FALSE;
+	
+	if (plan->n_rows_prefetched > 0) {
+		/* The table became exhausted during a prefetch */
+	
+		sel_pop_prefetched_row(plan);
+
+		goto next_table_no_mtr;
+	}
+
+table_exhausted_no_mtr:
+	if (node->fetch_table == 0) {
+		err = DB_SUCCESS;
+
+		if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+			node->aggregate_already_fetched = TRUE;
+
+			sel_assign_into_var_values(node->into_list, node);
+
+			thr->run_node = que_node_get_parent(node);
+
+			if (search_latch_locked) {
+				rw_lock_s_unlock(&btr_search_latch);
+			}
+		
+			goto func_exit;
+		}
+
+		node->state = SEL_NODE_NO_MORE_ROWS;
+		
+		thr->run_node = que_node_get_parent(node);
+
+		if (search_latch_locked) {
+			rw_lock_s_unlock(&btr_search_latch);
+		}
+		
+		goto func_exit;
+	}
+
+	node->fetch_table--;
+
+	goto table_loop;
+
+stop_for_a_while:
+	/* Return control for a while to que_run_threads, so that runaway
+	queries can be canceled. NOTE that when we come here, we must, in a
+	locking read, have placed the necessary (possibly waiting request)
+	record lock on the cursor record or its successor: when we reposition
+	the cursor, this record lock guarantees that nobody can meanwhile have
+	inserted new records which should have appeared in the result set,
+	which would result in the phantom problem. */ 
+
+	ut_ad(!search_latch_locked);
+
+	plan->stored_cursor_rec_processed = FALSE;
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+		
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+	err = DB_SUCCESS;
+	goto func_exit;
+
+commit_mtr_for_a_while:
+	/* Stores the cursor position and commits &mtr; this is used if
+	&mtr may contain latches which would break the latching order if
+	&mtr would not be committed and the latches released. */ 
+
+	plan->stored_cursor_rec_processed = TRUE;
+
+	ut_ad(!search_latch_locked);
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+	leaf_contains_updates = FALSE;
+	mtr_has_extra_clust_latch = FALSE;
+	
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+	goto table_loop;
+
+lock_wait_or_error:
+	/* See the note at stop_for_a_while: the same holds for this case */
+
+	ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr)
+							|| !node->asc);
+	ut_ad(!search_latch_locked);
+
+	plan->stored_cursor_rec_processed = FALSE;
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+	
+	mtr_commit(&mtr);
+		
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+func_exit:
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/**************************************************************************
+Performs a select step. This is a high-level function used in SQL execution
+graphs. If the select node is in the state SEL_NODE_OPEN, the node is first
+prepared for fetching (a read view is assigned or table intention locks are
+requested, and the state is changed to SEL_NODE_FETCH); after that row_sel()
+is called for the node. */
+
+que_thr_t*
+row_sel_step(
+/*=========*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	sel_node_t*	node;
+	ulint		err;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+		/* A select with an INTO list is put back to the open state
+		whenever control comes to it from its parent node */
+
+		node->state = SEL_NODE_OPEN;
+	}
+
+	if (node->state == SEL_NODE_OPEN) {
+		/* This is a new time this node is executed (or execution
+		resumes after a wait for a table intention lock): set
+		intention locks on the tables, or assign a read view.
+
+		It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		trx_start_if_not_started(thr_get_trx(thr));
+
+		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+		if (!node->consistent_read) {
+			sym_node_t*	tab;
+			ulint		intention_mode;
+
+			/* A locking read: request an intention lock on
+			every table in the table list */
+
+			intention_mode = node->set_x_locks
+						? LOCK_IX : LOCK_IS;
+
+			for (tab = node->table_list;
+			     tab != NULL;
+			     tab = que_node_get_next(tab)) {
+
+				err = lock_table(0, tab->table,
+						intention_mode, thr);
+
+				if (err != DB_SUCCESS) {
+
+					que_thr_handle_error(thr, DB_ERROR,
+								NULL, 0);
+					return(NULL);
+				}
+			}
+		} else {
+			/* Assign a read view for the query */
+			node->read_view = trx_assign_read_view(
+							thr_get_trx(thr));
+		}
+
+		/* If this is an explicit cursor, copy stored procedure
+		variable values, so that the values cannot change between
+		fetches (currently, we copy them also for non-explicit
+		cursors) */
+
+		if (node->explicit_cursor
+		    && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+			row_sel_copy_input_variable_vals(node);
+		}
+
+		node->fetch_table = 0;
+		node->state = SEL_NODE_FETCH;
+
+		if (node->is_aggregate) {
+			/* Reset the aggregate total values */
+			sel_reset_aggregate_vals(node);
+		}
+	}
+
+	err = row_sel(node, thr);
+
+	/* NOTE! if queries are parallelized, the following assignment may
+	have problems; the assignment should be made only if thr is the
+	only top-level thr in the graph: */
+
+	thr->graph->last_sel_node = node;
+
+	if (err == DB_LOCK_WAIT) {
+		/* No thread to run next */
+
+		return(NULL);
+	}
+
+	if (err != DB_SUCCESS) {
+		/* SQL error detected */
+		fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+		que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+		return(NULL);
+	}
+
+	/* Ok: the same query thread continues */
+
+	return(thr);
+}
+
+/**************************************************************************
+Performs a fetch for a cursor. If control arrives from the parent of the
+fetch node, execution is directed to the select node of the cursor;
+otherwise the selected values are assigned to the INTO variables (unless
+the select node is in the state SEL_NODE_NO_MORE_ROWS) and execution
+continues from the parent of the fetch node. */
+
+que_thr_t*
+fetch_step(
+/*=======*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	fetch_node_t*	fetch;
+	sel_node_t*	cursor;
+
+	ut_ad(thr);
+
+	fetch = thr->run_node;
+	cursor = fetch->cursor_def;
+
+	ut_ad(que_node_get_type(fetch) == QUE_NODE_FETCH);
+
+	if (thr->prev_node == que_node_get_parent(fetch)) {
+		/* Make the fetch node the parent of the cursor definition
+		for the time of the fetch, so that execution knows to return
+		to this fetch node after a row has been selected or we know
+		that there is no row left */
+
+		cursor->common.parent = fetch;
+
+		if (cursor->state == SEL_NODE_CLOSED) {
+			/* SQL error detected */
+			fprintf(stderr, "SQL error %lu\n", (ulong)DB_ERROR);
+
+			que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+			return(NULL);
+		}
+
+		thr->run_node = cursor;
+
+		return(thr);
+	}
+
+	/* Control did not come from the parent node: hand the fetched
+	values over, if there was a row, and continue from the parent */
+
+	if (cursor->state != SEL_NODE_NO_MORE_ROWS) {
+
+		sel_assign_into_var_values(fetch->into_list, cursor);
+	}
+
+	thr->run_node = que_node_get_parent(fetch);
+
+	return(thr);
+}
+
+/***************************************************************
+Prints a row in a select result. The values of the select list are
+written to stderr, each followed by the separator " ::: ", and the row is
+terminated with a newline. */
+
+que_thr_t*
+row_printf_step(
+/*============*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	row_printf_node_t*	printf_node;
+	sel_node_t*		select;
+	que_node_t*		item;
+
+	ut_ad(thr);
+
+	printf_node = thr->run_node;
+	select = printf_node->sel_node;
+
+	ut_ad(que_node_get_type(printf_node) == QUE_NODE_ROW_PRINTF);
+
+	if (thr->prev_node == que_node_get_parent(printf_node)) {
+
+		/* Control came from the parent: reset the cursor and
+		fetch the next row to print */
+
+		select->state = SEL_NODE_OPEN;
+
+		thr->run_node = select;
+
+	} else if (select->state != SEL_NODE_FETCH) {
+
+		ut_ad(select->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* No more rows to print: continue from the parent */
+
+		thr->run_node = que_node_get_parent(printf_node);
+
+	} else {
+		/* A row was fetched: print every select list value */
+
+		for (item = select->select_list;
+		     item != NULL;
+		     item = que_node_get_next(item)) {
+
+			dfield_print_also_hex(que_node_get_val(item));
+
+			fputs(" ::: ", stderr);
+		}
+
+		putc('\n', stderr);
+
+		/* Fetch next row to print */
+
+		thr->run_node = select;
+	}
+
+	return(thr);
+}
+
+/********************************************************************
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field len and print a warning if
+such appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc.
+
+The converted field values are placed consecutively in buf. Before each
+field is converted we assert that the data_len bytes reserved for it still
+fit in the unused part of buf, so that a too small buffer is caught before
+anything is written past its end. On return the number of fields in tuple
+is set to the number of key fields that were converted. */
+
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+	dtuple_t*	tuple,		/* in: tuple where to build;
+					NOTE: we assume that the type info
+					in the tuple is already according
+					to index! */
+	byte*		buf,		/* in: buffer to use in field
+					conversions */
+	ulint		buf_len,	/* in: buffer length */
+	dict_index_t*	index,		/* in: index of the key value */
+	byte*		key_ptr,	/* in: MySQL key value */
+	ulint		key_len,	/* in: MySQL key value length */
+	trx_t*		trx)		/* in: transaction */
+{
+	byte*		original_buf	= buf;
+	byte*		original_key_ptr = key_ptr;
+	dict_field_t*	field;
+	dfield_t*	dfield;
+	ulint		data_offset;
+	ulint		data_len;
+	ulint		data_field_len;
+	ibool		is_null;
+	byte*		key_end;
+	ulint		n_fields = 0;
+	ulint		type;
+
+	/* For documentation of the key value storage format in MySQL, see
+	ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+	key_end = key_ptr + key_len;
+
+	/* Permit us to access any field in the tuple (ULINT_MAX): */
+
+	dtuple_set_n_fields(tuple, ULINT_MAX);
+
+	dfield = dtuple_get_nth_field(tuple, 0);
+	field = dict_index_get_nth_field(index, 0);
+
+	if (dfield_get_type(dfield)->mtype == DATA_SYS) {
+		/* A special case: we are looking for a position in the
+		generated clustered index which InnoDB automatically added
+		to a table with no primary key: the first and the only
+		ordering column is ROW_ID which InnoDB stored to the key_ptr
+		buffer. */
+
+		ut_a(key_len == DATA_ROW_ID_LEN);
+
+		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+		dtuple_set_n_fields(tuple, 1);
+
+		return;
+	}
+
+	/* Convert one key field per iteration; field and dfield advance
+	in step with key_ptr */
+
+	while (key_ptr < key_end) {
+
+		ut_a(dict_col_get_type(field->col)->mtype
+		     == dfield_get_type(dfield)->mtype);
+
+		data_offset = 0;
+		is_null = FALSE;
+
+		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+			/* The first byte in the field tells if this is
+			an SQL NULL value */
+
+			data_offset = 1;
+
+			if (*key_ptr != 0) {
+				dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+				is_null = TRUE;
+			}
+		}
+
+		type = dfield_get_type(dfield)->mtype;
+
+		/* Calculate data length and data field total length */
+
+		if (type == DATA_BLOB) {
+			/* The key field is a column prefix of a BLOB or
+			TEXT */
+
+			ut_a(field->prefix_len > 0);
+
+			/* MySQL stores the actual data length to the first 2
+			bytes after the optional SQL NULL marker byte. The
+			storage format is little-endian, that is, the most
+			significant byte at a higher address. In UTF-8, MySQL
+			seems to reserve field->prefix_len bytes for
+			storing this field in the key value buffer, even
+			though the actual value only takes data_len bytes
+			from the start. */
+
+			data_len = key_ptr[data_offset]
+				   + 256 * key_ptr[data_offset + 1];
+			data_field_len = data_offset + 2 + field->prefix_len;
+
+			data_offset += 2;
+
+			/* Now that we know the length, we store the column
+			value like it would be a fixed char field */
+
+		} else if (field->prefix_len > 0) {
+			/* Looks like MySQL pads unused end bytes in the
+			prefix with space. Therefore, also in UTF-8, it is ok
+			to compare with a prefix containing full prefix_len
+			bytes, and no need to take at most prefix_len / 3
+			UTF-8 characters from the start.
+			If the prefix is used as the upper end of a LIKE
+			'abc%' query, then MySQL pads the end with chars
+			0xff. TODO: in that case does it any harm to compare
+			with the full prefix_len bytes. How do characters
+			0xff in UTF-8 behave? */
+
+			data_len = field->prefix_len;
+			data_field_len = data_offset + data_len;
+		} else {
+			data_len = dfield_get_type(dfield)->len;
+			data_field_len = data_offset + data_len;
+		}
+
+		if (dtype_get_mysql_type(dfield_get_type(dfield))
+					== DATA_MYSQL_TRUE_VARCHAR
+		    && dfield_get_type(dfield)->mtype != DATA_INT) {
+			/* In a MySQL key value format, a true VARCHAR is
+			always preceded by 2 bytes of a length field.
+			dfield_get_type(dfield)->len returns the maximum
+			'payload' len in bytes. That does not include the
+			2 bytes that tell the actual data length.
+
+			We added the check != DATA_INT to make sure we do
+			not treat MySQL ENUM or SET as a true VARCHAR! */
+
+			data_len += 2;
+			data_field_len += 2;
+		}
+
+		/* Storing may use at most data_len bytes of buf */
+
+		if (!is_null) {
+			/* Make sure BEFORE the conversion that data_len
+			bytes fit in the unused part of buf. The number of
+			used bytes, buf - original_buf, never exceeds buf_len
+			here: it is 0 at the start and grows only by a
+			data_len that has passed this same check. */
+
+			ut_a(data_len
+			     <= buf_len - (ulint) (buf - original_buf));
+
+			row_mysql_store_col_in_innobase_format(
+					dfield,
+					buf,
+					FALSE, /* MySQL key value format col */
+					key_ptr + data_offset,
+					data_len,
+					index->table->comp);
+			buf += data_len;
+		}
+
+		key_ptr += data_field_len;
+
+		if (key_ptr > key_end) {
+			/* The last field in key was not a complete key field
+			but a prefix of it.
+
+			Print a warning about this! HA_READ_PREFIX_LAST does
+			not currently work in InnoDB with partial-field key
+			value prefixes. Since MySQL currently uses a padding
+			trick to calculate LIKE 'abc%' type queries there
+			should never be partial-field prefixes in searches. */
+
+			ut_print_timestamp(stderr);
+
+			fputs(
+  "  InnoDB: Warning: using a partial-field key prefix in search.\n"
+  "InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, index);
+			fprintf(stderr, ". Last data field length %lu bytes,\n"
+  "InnoDB: key ptr now exceeds key end by %lu bytes.\n"
+  "InnoDB: Key value in the MySQL format:\n",
+					  (ulong) data_field_len,
+					  (ulong) (key_ptr - key_end));
+			fflush(stderr);
+			ut_print_buf(stderr, original_key_ptr, key_len);
+			fprintf(stderr, "\n");
+
+			if (!is_null) {
+				/* Shorten the stored field by the number of
+				bytes that the key value was missing */
+
+				dfield->len -= (ulint)(key_ptr - key_end);
+			}
+		}
+
+		n_fields++;
+		field++;
+		dfield++;
+	}
+
+	/* Already guaranteed by the check made before each conversion;
+	kept as a final sanity check */
+
+	ut_a(buf <= original_buf + buf_len);
+
+	/* We set the length of tuple to n_fields: we assume that the memory
+	area allocated for it is big enough (usually bigger than n_fields). */
+
+	dtuple_set_n_fields(tuple, n_fields);
+}
+
+/******************************************************************
+Stores the row id to the prebuilt struct. The DATA_ROW_ID system column
+of the index record is copied to prebuilt->row_id. If the length of the
+field is not DATA_ROW_ID_LEN, the index name and the record are printed
+to stderr and ut_error is invoked. */
+static
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt */
+	rec_t*		index_rec,	/* in: record */
+	dict_index_t*	index,		/* in: index of the record */
+	const ulint*	offsets)	/* in: rec_get_offsets
+					(index_rec, index) */
+{
+	ulint	row_id_pos;
+	ulint	row_id_len;
+	byte*	row_id;
+
+	ut_ad(rec_offs_validate(index_rec, index, offsets));
+
+	/* Position of the row id column among the fields of the index */
+	row_id_pos = dict_index_get_sys_col_pos(index, DATA_ROW_ID);
+
+	row_id = rec_get_nth_field(index_rec, offsets, row_id_pos,
+							&row_id_len);
+
+	if (row_id_len != DATA_ROW_ID_LEN) {
+		fprintf(stderr,
+"InnoDB: Error: Row id field is wrong length %lu in ",
+			(ulong) row_id_len);
+		dict_index_name_print(stderr, prebuilt->trx, index);
+		fprintf(stderr,
+"\nInnoDB: Field number %lu, record:\n",
+			(ulong) row_id_pos);
+		rec_print_new(stderr, index_rec, offsets);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	ut_memcpy(prebuilt->row_id, row_id, row_id_len);
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.c.
+The conversion depends on templ->type: integers are byte-reversed,
+variable-length strings are copied and padded up to mysql_col_len, for a
+BLOB only a reference is stored, and the remaining types are copied as
+they are. */
+static
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+	byte*	dest,	/* in/out: buffer where to store; NOTE that BLOBs
+			are not in themselves stored here: the caller must
+			allocate and copy the BLOB into buffer before, and pass
+			the pointer to the BLOB in 'data' */
+	const mysql_row_templ_t* templ,	/* in: MySQL column template.
+			Its following fields are referenced:
+			type, is_unsigned, mysql_col_len, mbminlen, mbmaxlen */
+	byte*	data,	/* in: data to store */
+	ulint	len)	/* in: length of the data */
+{
+	byte*	out;
+	byte*	col_end;
+	byte*	fill;
+
+	ut_ad(len != UNIV_SQL_NULL);
+
+	if (templ->type == DATA_INT) {
+		/* Convert integer data from Innobase to a little-endian
+		format, sign bit restored to normal: the first byte of data
+		goes to the last byte of dest, and so on */
+
+		out = dest + len;
+
+		do {
+			out--;
+			*out = *data;
+			data++;
+		} while (out != dest);
+
+		if (!templ->is_unsigned) {
+			/* Flip the highest bit of the most significant
+			byte */
+
+			dest[len - 1] = (byte) (dest[len - 1] ^ 128);
+		}
+
+		ut_ad(templ->mysql_col_len == len);
+
+		return;
+	}
+
+	if (templ->type == DATA_VARCHAR
+	    || templ->type == DATA_VARMYSQL
+	    || templ->type == DATA_BINARY) {
+
+		/* The end of the column is computed before dest is possibly
+		advanced past the length bytes below */
+
+		col_end = dest + templ->mysql_col_len;
+
+		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+			/* This is a >= 5.0.3 type true VARCHAR. Store the
+			length of the data to the first byte or the first
+			two bytes of dest. */
+
+			dest = row_mysql_store_true_var_len(dest, len,
+						templ->mysql_length_bytes);
+		}
+
+		/* Copy the actual data */
+		ut_memcpy(dest, data, len);
+
+		/* Pad with trailing spaces. We pad with spaces also the
+		unused end of a >= 5.0.3 true VARCHAR column, just in case
+		MySQL expects its contents to be deterministic. */
+
+		fill = dest + len;
+
+		ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+		if (templ->mbminlen != 2) {
+			ut_ad(templ->mbminlen == 1);
+			/* space=0x20 */
+
+			memset(fill, 0x20, col_end - fill);
+
+			return;
+		}
+
+		/* We handle UCS2 charset strings differently: a space char
+		is two bytes, 0x0020 in UCS2 */
+
+		if ((len & 1) && fill < col_end) {
+			/* A 0x20 has been stripped from the column.
+			Pad it back. */
+
+			*fill++ = 0x20;
+		}
+
+		/* Pad the rest of the string with 0x0020. NOTE(review): two
+		bytes are written per round, which presumably relies on the
+		remaining length being even - confirm against the callers */
+
+		while (fill < col_end) {
+			*fill++ = 0x00;
+			*fill++ = 0x20;
+		}
+
+		return;
+	}
+
+	if (templ->type == DATA_BLOB) {
+		/* Store a pointer to the BLOB buffer to dest: the BLOB was
+		already copied to the buffer in row_sel_store_mysql_rec */
+
+		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+									len);
+		return;
+	}
+
+	if (templ->type == DATA_MYSQL) {
+		memcpy(dest, data, len);
+
+		ut_a(templ->mysql_col_len >= len);
+		ut_a(templ->mbmaxlen >= templ->mbminlen);
+
+		ut_a(templ->mbmaxlen > templ->mbminlen
+			|| templ->mysql_col_len == len);
+		/* The following assertion would fail for old tables
+		containing UTF-8 ENUM columns due to Bug #9526. */
+		ut_ad(!templ->mbmaxlen
+			|| !(templ->mysql_col_len % templ->mbmaxlen));
+		ut_a(len * templ->mbmaxlen >= templ->mysql_col_len);
+
+		if (templ->mbminlen != templ->mbmaxlen) {
+			/* Pad with spaces. This undoes the stripping
+			done in row0mysql.ic, function
+			row_mysql_store_col_in_innobase_format(). */
+
+			memset(dest + len, 0x20, templ->mysql_col_len - len);
+		}
+
+		return;
+	}
+
+	/* The remaining types are stored as such */
+
+	ut_a(templ->type == DATA_CHAR
+		|| templ->type == DATA_FIXBINARY
+		/*|| templ->type == DATA_SYS_CHILD
+		|| templ->type == DATA_SYS*/
+		|| templ->type == DATA_FLOAT
+		|| templ->type == DATA_DOUBLE
+		|| templ->type == DATA_DECIMAL);
+	ut_ad(templ->mysql_col_len == len);
+
+	memcpy(dest, data, len);
+}
+
+/******************************************************************
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec, other columns are left blank. All columns may not
+be needed in the query.
+
+For each of the prebuilt->n_template column templates, the field is read
+from rec; an externally stored field is first copied to a temporary heap
+that is freed again once the column has been handled. A non-NULL value is
+converted with row_sel_field_store_in_mysql_format() and the NULL bit of
+the column (if it has one) is cleared. For an SQL NULL the NULL bit is set
+and the column is filled with a pad value. BLOB values are copied to
+prebuilt->blob_heap; that heap is freed at the start of every call, so
+BLOB copies made by an earlier call are released here. */
+static
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+					/* out: TRUE if success, FALSE if
+					could not allocate memory for a BLOB
+					(though we may also assert in that
+					case) */
+	byte*		mysql_rec,	/* out: row in the MySQL format */
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
+	rec_t*		rec,		/* in: Innobase record in the index
+					which was described in prebuilt's
+					template */
+	const ulint*	offsets)	/* in: array returned by
+					rec_get_offsets() */
+{
+	mysql_row_templ_t*	templ;
+	mem_heap_t*		extern_field_heap	= NULL;
+	byte*			data;
+	ulint			len;
+	byte*			blob_buf;
+	int			pad_char;	/* fill byte used for an SQL NULL column */
+	ulint			i;
+	
+	ut_ad(prebuilt->mysql_template);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (prebuilt->blob_heap != NULL) {	/* drop the BLOB heap kept in prebuilt */
+		mem_heap_free(prebuilt->blob_heap);
+		prebuilt->blob_heap = NULL;
+	}
+
+	for (i = 0; i < prebuilt->n_template; i++) {
+
+		templ = prebuilt->mysql_template + i;
+
+		data = rec_get_nth_field(rec, offsets,
+					templ->rec_field_no, &len);
+
+		if (rec_offs_nth_extern(offsets, templ->rec_field_no)) {
+
+			/* Copy an externally stored field to the temporary
+			heap. The assertion below requires that the
+			transaction is not holding the search latch here. */
+
+			ut_a(!prebuilt->trx->has_search_latch);
+
+			extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE);
+
+			/* NOTE: if we are retrieving a big BLOB, we may
+			already run out of memory in the next call, which
+			causes an assert */
+
+			data = btr_rec_copy_externally_stored_field(rec,
+					offsets, templ->rec_field_no, &len,
+					extern_field_heap);
+
+			ut_a(len != UNIV_SQL_NULL);
+		}
+
+		if (len != UNIV_SQL_NULL) {	/* the column holds a non-NULL value */
+			if (templ->type == DATA_BLOB) {
+
+				ut_a(prebuilt->templ_contains_blob);
+
+				/* A heuristic test that we can allocate the
+				memory for a big BLOB. We have a safety margin
+				of 1000000 bytes. Since the test takes some
+				CPU time, we do not use it for small BLOBs.
+				If the test fails we print a warning, free the
+				temporary heap (if any) and return FALSE. */
+
+				if (len > 2000000
+				    && !ut_test_malloc(len + 1000000)) {
+
+					ut_print_timestamp(stderr);
+					fprintf(stderr,
+"  InnoDB: Warning: could not allocate %lu + 1000000 bytes to retrieve\n"
+"InnoDB: a big column. Table name ", (ulong) len);
+					ut_print_name(stderr,
+						prebuilt->trx,
+						prebuilt->table->name);
+					putc('\n', stderr);
+
+					if (extern_field_heap) {
+						mem_heap_free(
+							extern_field_heap);
+					}
+					return(FALSE);
+				}
+
+				/* Copy the BLOB data to the BLOB heap of
+				prebuilt; the heap is created here if it does
+				not exist yet. From here on data points to
+				the copy in the BLOB heap. */
+
+				if (prebuilt->blob_heap == NULL) {
+					prebuilt->blob_heap =
+						mem_heap_create(len);
+				}
+
+				blob_buf = mem_heap_alloc(prebuilt->blob_heap,
+									len);
+				ut_memcpy(blob_buf, data, len);
+
+				data = blob_buf;
+			}
+		
+			row_sel_field_store_in_mysql_format(
+				mysql_rec + templ->mysql_col_offset,
+				templ, data, len);
+
+			/* Cleanup: free the temporary copy of an externally
+			stored field, if one was made for this column */
+			if (extern_field_heap) {
+ 				mem_heap_free(extern_field_heap);
+				extern_field_heap = NULL;
+ 			}
+			
+			if (templ->mysql_null_bit_mask) {
+				/* It is a nullable column with a non-NULL
+				value: clear its NULL bit */
+				mysql_rec[templ->mysql_null_byte_offset] &=
+					~(byte) (templ->mysql_null_bit_mask);
+			}
+		} else {
+		        /* MySQL seems to assume the field for an SQL NULL
+			value is set to zero or space. Not taking this into
+			account caused seg faults with NULL BLOB fields, and
+			bug number 154 in the MySQL bug database: GROUP BY
+			and DISTINCT could treat NULL values as unequal.
+			Below we set the NULL bit of the column and then fill
+			the column with the pad value. */
+
+			mysql_rec[templ->mysql_null_byte_offset] |=
+					(byte) (templ->mysql_null_bit_mask);
+			if (templ->type == DATA_VARCHAR
+			    || templ->type == DATA_CHAR
+			    || templ->type == DATA_BINARY
+			    || templ->type == DATA_FIXBINARY
+			    || templ->type == DATA_MYSQL
+			    || templ->type == DATA_VARMYSQL) {
+			        /* MySQL pads all non-BLOB and non-TEXT
+				string types with space ' ' */
+			    
+				pad_char = ' ';
+			} else {
+				pad_char = '\0';
+			}
+
+			/* Handle UCS2 strings differently: a space-padded
+			column whose minimum character length is 2 bytes is
+			filled with the byte pair 0x00 0x20. data and len are
+			reused here as the write cursor and the number of
+			bytes left to fill. */
+			if (pad_char != '\0' && templ->mbminlen == 2) {
+				/* There are two bytes per char, so the length
+				has to be an even number. */
+				ut_a(!(templ->mysql_col_len & 1));
+				data = mysql_rec + templ->mysql_col_offset;
+				len = templ->mysql_col_len;
+				/* Pad with 0x0020. */
+				while (len >= 2) {
+					*data++ = 0x00;
+					*data++ = 0x20;
+					len -= 2;
+				}
+			} else {
+				ut_ad(!pad_char || templ->mbminlen == 1);
+				memset(mysql_rec + templ->mysql_col_offset,
+					pad_char, templ->mysql_col_len);
+			}
+		}
+	} 
+
+	return(TRUE);
+}
+
+/*************************************************************************
+Builds, for a consistent read, a previous version of a clustered index
+record. The heap prebuilt->old_vers_heap is handed to
+row_vers_build_for_consistent_read(); it is created on the first call and
+emptied on every later call. */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+					/* out: DB_SUCCESS or error code */
+	read_view_t*	read_view,	/* in: read view of the reader */
+	dict_index_t*	clust_index,	/* in: the clustered index */
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct; its
+					old_vers_heap is (re)used here */
+	rec_t*		rec,		/* in: clustered index record */
+	ulint**		offsets,	/* in/out: offsets obtained with
+					rec_get_offsets(rec, clust_index) */
+	mem_heap_t**	offset_heap,	/* in/out: heap from which the
+					offsets are allocated */
+	rec_t**		old_vers,	/* out: the old version, or NULL if
+					the record does not exist in the
+					view, i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	if (prebuilt->old_vers_heap == NULL) {
+		/* First use: create the heap */
+		prebuilt->old_vers_heap = mem_heap_create(200);
+	} else {
+		/* Reuse the heap left from an earlier call */
+		mem_heap_empty(prebuilt->old_vers_heap);
+	}
+
+	return(row_vers_build_for_consistent_read(rec, mtr, clust_index,
+					offsets, read_view, offset_heap,
+					prebuilt->old_vers_heap, old_vers));
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface.
+
+The row reference is built into prebuilt->clust_ref and the clustered
+index is searched with prebuilt->clust_pcur in PAGE_CUR_LE mode.
+DB_SUCCESS is returned also when no usable clustered record exists; the
+caller must then check *out_rec, which is NULL in that case. If an error
+code is returned, *out_rec is left NULL. When select_lock_type is LOCK_X,
+the position of prebuilt->clust_pcur is stored before a successful
+return. */
+static
+ulint
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+				/* out: DB_SUCCESS or error code */
+	row_prebuilt_t*	prebuilt,/* in: prebuilt struct in the handle */
+	dict_index_t*	sec_index,/* in: secondary index where rec resides */
+	rec_t*		rec,	/* in: record in a non-clustered index; if
+				this is a locking read, then rec is not
+				allowed to be delete-marked, and that would
+				not make sense either */
+	que_thr_t*	thr,	/* in: query thread */
+	rec_t**		out_rec,/* out: clustered record or an old version of
+				it, NULL if the old version did not exist
+				in the read view, i.e., it was a fresh
+				inserted version */
+	ulint**		offsets,/* out: offsets returned by
+				rec_get_offsets(out_rec, clust_index) */
+	mem_heap_t**	offset_heap,/* in/out: memory heap from which
+				the offsets are allocated */
+	mtr_t*		mtr)	/* in: mtr used to get access to the
+				non-clustered record; the same mtr is used to
+				access the clustered index */
+{
+	dict_index_t*	clust_index;
+	rec_t*		clust_rec;
+	rec_t*		old_vers;
+	ulint		err;
+	trx_t*		trx;
+
+	*out_rec = NULL;	/* stays NULL if we leave through err_exit */
+	trx = thr_get_trx(thr);
+	
+	row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec, trx);
+
+	clust_index = dict_table_get_first_index(sec_index->table);
+	
+	btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+			PAGE_CUR_LE, BTR_SEARCH_LEAF,
+			prebuilt->clust_pcur, 0, mtr);
+
+	clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+	prebuilt->clust_pcur->trx_if_known = trx;
+
+	/* Note: only if the search ends up on a non-infimum record is the
+	low_match value the real match to the search tuple. The test below
+	treats the clustered record as missing if the cursor is not on a
+	user record or fewer than n_unique fields matched. */
+
+	if (!page_rec_is_user_rec(clust_rec)
+	    || btr_pcur_get_low_match(prebuilt->clust_pcur)
+	       < dict_index_get_n_unique(clust_index)) {
+	
+		/* In a rare case it is possible that no clust rec is found
+		for a delete-marked secondary index record: if in row0umod.c
+		in row_undo_mod_remove_clust_low() we have already removed
+		the clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case we know that the
+		clustered index record did not exist in the read view of
+		trx. Any other case (rec is not delete-marked, or this is a
+		locking read) is reported to stderr as an error, but the
+		function still returns DB_SUCCESS with a NULL record. */
+
+		if (!rec_get_deleted_flag(rec, sec_index->table->comp)
+		    || prebuilt->select_lock_type != LOCK_NONE) {
+		        ut_print_timestamp(stderr);
+			fputs("  InnoDB: error clustered record"
+				" for sec rec not found\n"
+				"InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, sec_index);
+			fputs("\n"
+				"InnoDB: sec index record ", stderr);
+			rec_print(stderr, rec, sec_index);
+			fputs("\n"
+				"InnoDB: clust index record ", stderr);
+			rec_print(stderr, clust_rec, clust_index);
+			putc('\n', stderr);
+			trx_print(stderr, trx);
+
+			fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+		}
+
+		clust_rec = NULL;
+
+		goto func_exit;
+	}
+
+	*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+					ULINT_UNDEFINED, offset_heap);
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* Try to place a lock on the index record; we are searching
+		the clust rec with a unique condition, hence
+		we set a LOCK_REC_NOT_GAP type lock */
+		
+		err = lock_clust_rec_read_check_and_lock(0, clust_rec,
+					clust_index, *offsets,
+					prebuilt->select_lock_type,
+					LOCK_REC_NOT_GAP, thr);
+		if (err != DB_SUCCESS) {
+
+			goto err_exit;
+		}
+	} else {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		old_vers = NULL;
+
+		/* If the isolation level allows reading of uncommitted data,
+		then we never look for an earlier version */
+
+		if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+		    && !lock_clust_rec_cons_read_sees(clust_rec, clust_index,
+						*offsets, trx->read_view)) {
+
+			err = row_sel_build_prev_vers_for_mysql(
+					trx->read_view, clust_index,
+					prebuilt, clust_rec,
+					offsets, offset_heap,
+					&old_vers, mtr);
+						
+			if (err != DB_SUCCESS) {
+
+				goto err_exit;
+			}
+
+			clust_rec = old_vers;
+		}
+
+		/* If we had to go to an earlier version of row or the
+		secondary index record is delete marked, then it may be that
+		the secondary index record corresponding to clust_rec
+		(or old_vers) is not rec; in that case we must ignore
+		such row because in our snapshot rec would not have existed.
+		Remember that from rec we cannot see directly which transaction
+		id corresponds to it: we have to go to the clustered index
+		record. A query where we want to fetch all rows where
+		the secondary index value is in some interval would return
+		a wrong result if we would not drop rows which we come to
+		visit through secondary index records that would not really
+		exist in our snapshot. */
+		
+		if (clust_rec && (old_vers
+			|| rec_get_deleted_flag(rec, sec_index->table->comp))
+		    && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+						clust_rec, clust_index)) {
+			clust_rec = NULL;
+		} else {
+#ifdef UNIV_SEARCH_DEBUG
+			ut_a(clust_rec == NULL ||
+			    row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+						clust_rec, clust_index));
+#endif		
+		}
+	}
+
+func_exit:
+	*out_rec = clust_rec;
+
+	if (prebuilt->select_lock_type == LOCK_X) {
+		/* We may use the cursor in update: store its position */
+		
+		btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+	}
+
+	err = DB_SUCCESS;
+err_exit:
+	return(err);
+}
+
+/************************************************************************
+Puts a persistent cursor back on its stored position and tells the caller
+whether the record under the cursor may still have to be processed. The
+record the cursor was stored on may have been deleted in the meantime, in
+which case the cursor may have to be stepped one record up or down,
+depending on the direction of the scan. */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+					/* out: TRUE if the record the cursor
+					ends up on may still need to be
+					processed (i.e. the caller should
+					not go to the next record yet) */
+	ulint		latch_mode,	/* in: latch mode to use in the
+					restoration */
+	btr_pcur_t*	pcur,		/* in: cursor whose position has
+					been stored */
+	ibool		moves_up,	/* in: TRUE if the cursor moves up
+					in the index */
+	mtr_t*		mtr)		/* in: mtr; CAUTION: may commit
+					mtr temporarily! */
+{
+	/* Sample the stored relative position first, then restore */
+	ulint	stored_rel_pos	= pcur->rel_pos;
+	ibool	restored	= btr_pcur_restore_position(latch_mode,
+								pcur, mtr);
+
+	if (stored_rel_pos == BTR_PCUR_ON) {
+		if (!restored) {
+			/* The restore did not report success: when
+			scanning upward, step to the next record */
+			if (moves_up) {
+				btr_pcur_move_to_next(pcur, mtr);
+			}
+
+			return(TRUE);
+		}
+
+		return(FALSE);
+	}
+
+	if (stored_rel_pos == BTR_PCUR_AFTER
+	    || stored_rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE) {
+
+		/* Only a downward scan may need a step back here */
+		if (!moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
+			btr_pcur_move_to_prev(pcur, mtr);
+		}
+
+		return(TRUE);
+	}
+
+	ut_ad(stored_rel_pos == BTR_PCUR_BEFORE
+	     || stored_rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE);
+
+	/* Only an upward scan may need a step forward here */
+	if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
+		btr_pcur_move_to_next(pcur, mtr);
+	}
+
+	return(TRUE);
+}
+
+/************************************************************************
+Removes the first cached row from the fetch cache of prebuilt and copies
+it to buf. If prebuilt->keep_other_fields_on_keyread is set, only the
+columns listed in the template (and their NULL bits) are copied; otherwise
+the whole row of prebuilt->mysql_row_len bytes is copied. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+	byte*		buf,		/* in/out: buffer that receives the
+					row */
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct */
+{
+	byte*	src_row;
+	ulint	col;
+
+	ut_ad(prebuilt->n_fetch_cached > 0);
+
+	src_row = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
+	if (!prebuilt->keep_other_fields_on_keyread) {
+		/* Plain case: copy the cached row as a whole */
+		ut_memcpy(buf, src_row, prebuilt->mysql_row_len);
+	} else {
+		/* Copy the cached row column by column, so that bytes of
+		buf outside the template columns are left as they are */
+		for (col = 0; col < prebuilt->n_template; col++) {
+			mysql_row_templ_t*	templ
+				= prebuilt->mysql_template + col;
+
+			ut_memcpy(buf + templ->mysql_col_offset,
+				src_row + templ->mysql_col_offset,
+				templ->mysql_col_len);
+
+			if (templ->mysql_null_bit_mask) {
+				/* Take over the NULL bit of this column from
+				the cached row; the other bits of the byte
+				keep their value */
+				byte	mask = (byte) templ->mysql_null_bit_mask;
+				byte*	dst = buf
+					+ templ->mysql_null_byte_offset;
+				byte	src = src_row[
+					templ->mysql_null_byte_offset];
+
+				*dst = (byte) ((*dst & ~mask) | (src & mask));
+			}
+		}
+	}
+
+	/* Advance to the next cached row; rewind when the cache is empty */
+	if (--prebuilt->n_fetch_cached == 0) {
+		prebuilt->fetch_cache_first = 0;
+	} else {
+		prebuilt->fetch_cache_first++;
+	}
+}
+
+/************************************************************************
+Converts a record to the MySQL format and appends it to the fetch cache
+of prebuilt. The cache buffers are allocated on the first call. The
+template must not contain BLOBs, and a failure to store the row is
+treated as a fatal error. */
+UNIV_INLINE
+void
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
+	rec_t*		rec,		/* in: record to push */
+	const ulint*	offsets)	/* in: rec_get_offsets() */
+{
+	byte*	raw;
+	byte*	row_buf;
+	ulint	slot;
+
+	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_a(!prebuilt->templ_contains_blob);
+
+	if (prebuilt->fetch_cache[0] == NULL) {
+		/* First push: allocate every buffer of the fetch cache.
+		Each row buffer is framed by two 4-byte magic numbers,
+		because a user has reported memory corruption in these
+		buffers in Linux; the magic numbers help to track a
+		possible bug. */
+
+		for (slot = 0; slot < MYSQL_FETCH_CACHE_SIZE; slot++) {
+
+			raw = mem_alloc(prebuilt->mysql_row_len + 8);
+
+			mach_write_to_4(raw, ROW_PREBUILT_FETCH_MAGIC_N);
+			mach_write_to_4(raw + 4 + prebuilt->mysql_row_len,
+					ROW_PREBUILT_FETCH_MAGIC_N);
+
+			/* The row itself starts after the leading magic
+			number */
+			prebuilt->fetch_cache[slot] = raw + 4;
+		}
+	}
+
+	ut_ad(prebuilt->fetch_cache_first == 0);
+
+	row_buf = prebuilt->fetch_cache[prebuilt->n_fetch_cached];
+
+	if (!row_sel_store_mysql_rec(row_buf, prebuilt, rec, offsets)) {
+		ut_error;
+	}
+
+	prebuilt->n_fetch_cached++;
+}
+
+/*************************************************************************
+Attempts a shortcut fetch of a clustered index record with a unique key,
+using the hash index if possible (not always). The caller is assumed to
+search in PAGE_CUR_GE mode, to do a consistent read, to have a read view
+in trx, and to have locked the btr search latch in S-mode. */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+				/* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+	rec_t**		out_rec,/* out: the record, set only when SEL_FOUND
+				is returned */
+	row_prebuilt_t*	prebuilt,/* in: prebuilt struct */
+	ulint**		offsets,/* in/out: for rec_get_offsets(*out_rec) */
+	mem_heap_t**	heap,	/* in/out: heap for rec_get_offsets() */
+	mtr_t*		mtr)	/* in: started mtr */
+{
+	dict_index_t*	clust_index	= prebuilt->index;
+	dtuple_t*	key		= prebuilt->search_tuple;
+	btr_pcur_t*	cursor		= prebuilt->pcur;
+	trx_t*		reader		= prebuilt->trx;
+	rec_t*		found_rec;
+
+	ut_ad(clust_index->type & DICT_CLUSTERED);
+	ut_ad(!prebuilt->templ_contains_blob);
+
+	btr_pcur_open_with_no_init(clust_index, key, PAGE_CUR_GE,
+					BTR_SEARCH_LEAF, cursor,
+#ifndef UNIV_SEARCH_DEBUG
+					RW_S_LATCH,
+#else
+					0,
+#endif
+					mtr);
+	found_rec = btr_pcur_get_rec(cursor);
+
+	if (!page_rec_is_user_rec(found_rec)) {
+		/* Not positioned on a user record: no shortcut */
+
+		return(SEL_RETRY);
+	}
+
+	/* The cursor is on a user record after a PAGE_CUR_GE search, so
+	the up_match field of the cursor tells how many fields of the
+	record matched the search tuple; fewer than all fields of the
+	tuple means that there is no matching record */
+
+	if (btr_pcur_get_up_match(cursor) < dtuple_get_n_fields(key)) {
+
+		return(SEL_EXHAUSTED);
+	}
+
+	/* Non-locking consistent read: if the read view does not see this
+	version of the record, give up the shortcut and let the caller
+	retry */
+
+	*offsets = rec_get_offsets(found_rec, clust_index, *offsets,
+					ULINT_UNDEFINED, heap);
+
+	if (!lock_clust_rec_cons_read_sees(found_rec, clust_index,
+				*offsets, reader->read_view)) {
+
+		return(SEL_RETRY);
+	}
+
+	if (rec_get_deleted_flag(found_rec, clust_index->table->comp)) {
+		/* A delete-marked record is not returned */
+
+		return(SEL_EXHAUSTED);
+	}
+
+	*out_rec = found_rec;
+
+	return(SEL_FOUND);
+}
+
+/************************************************************************
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position and fetch next or fetch prev must not be tried to the cursor! */
+
+ulint
+row_search_for_mysql(
+/*=================*/
+					/* out: DB_SUCCESS,
+					DB_RECORD_NOT_FOUND, 
+					DB_END_OF_INDEX, DB_DEADLOCK,
+					DB_LOCK_TABLE_FULL, DB_CORRUPTION,
+					or DB_TOO_BIG_RECORD */
+	byte*		buf,		/* in/out: buffer for the fetched
+					row in the MySQL format */
+	ulint		mode,		/* in: search mode PAGE_CUR_L, ... */
+	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct for the
+					table handle; this contains the info
+					of search_tuple, index; if search
+					tuple contains 0 fields then we
+					position the cursor at the start or
+					the end of the index, depending on
+					'mode' */
+	ulint		match_mode,	/* in: 0 or ROW_SEL_EXACT or
+					ROW_SEL_EXACT_PREFIX */ 
+	ulint		direction)	/* in: 0 or ROW_SEL_NEXT or
+					ROW_SEL_PREV; NOTE: if this is != 0,
+					then prebuilt must have a pcur
+					with stored position! In opening of a
+					cursor 'direction' should be 0. */
+{
+	dict_index_t*	index		= prebuilt->index;
+	dtuple_t*	search_tuple	= prebuilt->search_tuple;
+	btr_pcur_t*	pcur		= prebuilt->pcur;
+	trx_t*		trx		= prebuilt->trx;
+	dict_index_t*	clust_index;
+	que_thr_t*	thr;
+	rec_t*		rec;
+	rec_t*		index_rec;
+	rec_t*		clust_rec;
+	rec_t*		old_vers;
+	ulint		err             = DB_SUCCESS;
+	ibool		moved;
+	ibool		cons_read_requires_clust_rec;
+	ibool		was_lock_wait;
+	ulint		shortcut;
+	ibool		unique_search			= FALSE;
+	ibool		unique_search_from_clust_index	= FALSE;
+	ibool		mtr_has_extra_clust_latch 	= FALSE;
+	ibool		moves_up 			= FALSE;
+	ibool		set_also_gap_locks		= TRUE;
+					/* if the query is a plain
+					locking SELECT, and the isolation
+					level is <= TRX_ISO_READ_COMMITTED,
+					then this is set to FALSE */
+	ibool		success;
+	ibool		comp;
+	ulint		cnt				= 0;
+	ulint		next_offs;
+	mtr_t		mtr;
+	mem_heap_t*	heap				= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets				= offsets_;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_ad(index && pcur && search_tuple);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	if (prebuilt->table->ibd_file_missing) {
+	        ut_print_timestamp(stderr);
+	        fprintf(stderr, "  InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+				prebuilt->table->name);
+		return(DB_ERROR);
+	}
+
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+		fprintf(stderr,
+		"InnoDB: Error: trying to free a corrupt\n"
+		"InnoDB: table handle. Magic n %lu, table name ",
+		(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, trx, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption((byte*)prebuilt);
+
+		ut_error;
+	}
+
+	if (trx->n_mysql_tables_in_use == 0
+            && prebuilt->select_lock_type == LOCK_NONE) {
+		/* Note that if MySQL uses an InnoDB temp table that it
+		created inside LOCK TABLES, then n_mysql_tables_in_use can
+		be zero; in that case select_lock_type is set to LOCK_X in
+		::start_stmt. */
+
+		fputs(
+"InnoDB: Error: MySQL is trying to perform a SELECT\n"
+"InnoDB: but it has not locked any tables in ::external_lock()!\n",
+                      stderr);
+		trx_print(stderr, trx);
+                fputc('\n', stderr);
+	}
+
+/*	fprintf(stderr, "Match mode %lu\n search tuple ", (ulong) match_mode);
+	dtuple_print(search_tuple);
+	
+	fprintf(stderr, "N tables locked %lu\n", trx->mysql_n_tables_locked);
+*/
+	/*-------------------------------------------------------------*/
+	/* PHASE 0: Release a possible s-latch we are holding on the
+	adaptive hash index latch if there is someone waiting behind */
+
+	if (trx->has_search_latch
+	    && btr_search_latch.writer != RW_LOCK_NOT_LOCKED) {
+
+		/* There is an x-latch request on the adaptive hash index:
+		release the s-latch to reduce starvation and wait for
+		BTR_SEA_TIMEOUT rounds before trying to keep it again over
+		calls from MySQL */
+
+		rw_lock_s_unlock(&btr_search_latch);
+		trx->has_search_latch = FALSE;
+
+		trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+	}
+	
+	/*-------------------------------------------------------------*/
+	/* PHASE 1: Try to pop the row from the prefetch cache */
+
+	if (direction == 0) {
+		trx->op_info = "starting index read";
+	
+		prebuilt->n_rows_fetched = 0;
+		prebuilt->n_fetch_cached = 0;
+		prebuilt->fetch_cache_first = 0;
+
+		if (prebuilt->sel_graph == NULL) {
+			/* Build a dummy select query graph */
+			row_prebuild_sel_graph(prebuilt);
+		}
+	} else {
+		trx->op_info = "fetching rows";
+
+		if (prebuilt->n_rows_fetched == 0) {
+			prebuilt->fetch_direction = direction;
+		}
+
+		if (direction != prebuilt->fetch_direction) {
+			if (prebuilt->n_fetch_cached > 0) {
+				ut_error;
+				/* TODO: scrollable cursor: restore cursor to
+				the place of the latest returned row,
+				or better: prevent caching for a scroll
+				cursor! */
+			}
+		
+			prebuilt->n_rows_fetched = 0;
+			prebuilt->n_fetch_cached = 0;
+			prebuilt->fetch_cache_first = 0;
+
+		} else if (prebuilt->n_fetch_cached > 0) {
+			row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+			prebuilt->n_rows_fetched++;
+
+			srv_n_rows_read++;
+			err = DB_SUCCESS;
+			goto func_exit;
+		}
+
+		if (prebuilt->fetch_cache_first > 0
+		    && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+		    	/* The previous returned row was popped from the fetch
+		    	cache, but the cache was not full at the time of the
+		    	popping: no more rows can exist in the result set */
+
+			err = DB_RECORD_NOT_FOUND;
+			goto func_exit;
+		}
+		
+		prebuilt->n_rows_fetched++;
+
+		if (prebuilt->n_rows_fetched > 1000000000) {
+			/* Prevent wrap-over */
+			prebuilt->n_rows_fetched = 500000000;
+		}
+
+		mode = pcur->search_mode;
+	}
+
+	/* In a search where at most one record in the index may match, we
+	can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete-
+	marked matching record.
+
+	Note that in a unique secondary index there may be different delete-
+	marked versions of a record where only the primary key values differ:
+	thus in a secondary index we must use next-key locks when locking
+	delete-marked records. */
+	
+	if (match_mode == ROW_SEL_EXACT
+	    && index->type & DICT_UNIQUE
+	    && dtuple_get_n_fields(search_tuple)
+					== dict_index_get_n_unique(index)
+	    && (index->type & DICT_CLUSTERED
+		 || !dtuple_contains_null(search_tuple))) {
+
+		/* Note above that a UNIQUE secondary index can contain many
+		rows with the same key value if one of the columns is the SQL
+		null. A clustered index under MySQL can never contain null
+		columns because we demand that all the columns in primary key
+		are non-null. */
+
+		unique_search = TRUE;
+
+		/* Even if the condition is unique, MySQL seems to try to
+		retrieve also a second row if a primary key contains more than
+		1 column. Return immediately if this is not a HANDLER
+		command. */
+
+		if (direction != 0 && !prebuilt->used_in_HANDLER) {
+        
+			err = DB_RECORD_NOT_FOUND;
+			goto func_exit;
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 2: Try fast adaptive hash index search if possible */
+
+	/* Next test if this is the special case where we can use the fast
+	adaptive hash index to try the search. Since we must release the
+	search system latch when we retrieve an externally stored field, we
+	cannot use the adaptive hash index in a search in the case the row
+	may be long and there may be externally stored fields */
+
+	if (unique_search
+	    && index->type & DICT_CLUSTERED
+	    && direction == 0
+	    && !prebuilt->templ_contains_blob
+	    && !prebuilt->used_in_HANDLER
+	    && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
+
+		mode = PAGE_CUR_GE;
+
+		unique_search_from_clust_index = TRUE;
+
+		if (trx->mysql_n_tables_locked == 0
+		    && prebuilt->select_lock_type == LOCK_NONE
+		    && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+		    && trx->read_view) {
+
+			/* This is a SELECT query done as a consistent read,
+			and the read view has already been allocated:
+			let us try a search shortcut through the hash
+			index.
+			NOTE that we must also test that
+			mysql_n_tables_locked == 0, because this might
+			also be INSERT INTO ... SELECT ... or
+			CREATE TABLE ... SELECT ... . Our algorithm is
+			NOT prepared to inserts interleaved with the SELECT,
+			and if we try that, we can deadlock on the adaptive
+			hash index semaphore! */
+
+#ifndef UNIV_SEARCH_DEBUG			
+			if (!trx->has_search_latch) {
+				rw_lock_s_lock(&btr_search_latch);
+				trx->has_search_latch = TRUE;
+			}
+#endif
+			shortcut = row_sel_try_search_shortcut_for_mysql(&rec,
+					prebuilt, &offsets, &heap, &mtr);
+			if (shortcut == SEL_FOUND) {
+#ifdef UNIV_SEARCH_DEBUG
+				ut_a(0 == cmp_dtuple_rec(search_tuple,
+							rec, offsets));
+#endif 
+				if (!row_sel_store_mysql_rec(buf, prebuilt,
+							rec, offsets)) {
+ 					err = DB_TOO_BIG_RECORD;
+
+					/* We let the main loop to do the
+					error handling */
+ 					goto shortcut_fails_too_big_rec;
+				}
+	
+ 				mtr_commit(&mtr);
+
+				/* ut_print_name(stderr, index->name);
+				fputs(" shortcut\n", stderr); */
+
+				srv_n_rows_read++;
+				
+				if (trx->search_latch_timeout > 0
+				    && trx->has_search_latch) {
+
+					trx->search_latch_timeout--;
+
+			        	rw_lock_s_unlock(&btr_search_latch);
+					trx->has_search_latch = FALSE;
+				}    	
+				
+				/* NOTE that we do NOT store the cursor
+				position */
+				err = DB_SUCCESS;
+				goto func_exit;
+			
+			} else if (shortcut == SEL_EXHAUSTED) {
+
+ 				mtr_commit(&mtr);
+
+				/* ut_print_name(stderr, index->name);
+				fputs(" record not found 2\n", stderr); */
+
+				if (trx->search_latch_timeout > 0
+				    && trx->has_search_latch) {
+
+					trx->search_latch_timeout--;
+
+			        	rw_lock_s_unlock(&btr_search_latch);
+					trx->has_search_latch = FALSE;
+				}
+
+				/* NOTE that we do NOT store the cursor
+				position */
+
+				err = DB_RECORD_NOT_FOUND;
+				goto func_exit;
+			}
+shortcut_fails_too_big_rec:
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+		}
+	}
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 3: Open or restore index cursor position */
+
+	if (trx->has_search_latch) {
+		rw_lock_s_unlock(&btr_search_latch);
+		trx->has_search_latch = FALSE;
+	}			
+
+	trx_start_if_not_started(trx);
+
+	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+	    && prebuilt->select_lock_type != LOCK_NONE
+	    && trx->mysql_query_str) {
+
+		/* Scan the MySQL query string; check if SELECT is the first
+	        word there */
+
+		dict_accept(*trx->mysql_query_str, "SELECT", &success);
+
+		if (success) {
+			/* It is a plain locking SELECT and the isolation
+			level is low: do not lock gaps */
+
+			set_also_gap_locks = FALSE;
+		}
+	}
+	
+	/* Note that if the search mode was GE or G, then the cursor
+	naturally moves upward (in fetch next) in alphabetical order,
+	otherwise downward */
+	
+	if (direction == 0) {
+		if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+			moves_up = TRUE;
+		}
+	} else if (direction == ROW_SEL_NEXT) {
+		moves_up = TRUE;
+	}
+
+	thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	if (direction != 0) {		
+		moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+							moves_up, &mtr);
+		if (!moved) {
+			goto next_rec;
+		}
+
+	} else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+		btr_pcur_open_with_no_init(index, search_tuple, mode,
+					BTR_SEARCH_LEAF,
+					pcur, 0, &mtr);
+
+		pcur->trx_if_known = trx;
+	} else {
+		if (mode == PAGE_CUR_G) {
+			btr_pcur_open_at_index_side(TRUE, index,
+					BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+		} else if (mode == PAGE_CUR_L) {
+			btr_pcur_open_at_index_side(FALSE, index,
+					BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+		}
+	}
+
+	if (!prebuilt->sql_stat_start) {
+		/* No need to set an intention lock or assign a read view */
+
+		if (trx->read_view == NULL
+		    && prebuilt->select_lock_type == LOCK_NONE) {
+
+			fputs(
+"InnoDB: Error: MySQL is trying to perform a consistent read\n"
+"InnoDB: but the read view is not assigned!\n", stderr);
+			trx_print(stderr, trx);
+                        fputc('\n', stderr);
+			ut_a(0);
+		}
+	} else if (prebuilt->select_lock_type == LOCK_NONE) {
+		/* This is a consistent read */	
+		/* Assign a read view for the query */
+
+		trx_assign_read_view(trx);
+		prebuilt->sql_stat_start = FALSE;
+	} else {
+		if (prebuilt->select_lock_type == LOCK_S) {		
+			err = lock_table(0, index->table, LOCK_IS, thr);
+		} else {
+			err = lock_table(0, index->table, LOCK_IX, thr);
+		}
+
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+		prebuilt->sql_stat_start = FALSE;
+	}
+
+rec_loop:
+	/*-------------------------------------------------------------*/
+	/* PHASE 4: Look for matching records in a loop */
+	
+	rec = btr_pcur_get_rec(pcur);
+	comp = index->table->comp;
+	ut_ad(comp == page_is_comp(buf_frame_align(rec)));
+/*
+	fputs("Using ", stderr);
+	dict_index_name_print(stderr, index);
+	fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
+			buf_frame_get_page_no(buf_frame_align(rec)));
+	rec_print(rec);
+*/
+	if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+		/* The infimum record on a page cannot be in the result set,
+		and neither can a record lock be placed on it: we skip such
+		a record. */
+
+		goto next_rec;
+	}
+
+	if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+		if (prebuilt->select_lock_type != LOCK_NONE
+		    && set_also_gap_locks) {
+
+			/* Try to place a lock on the index record */
+
+			/* If innodb_locks_unsafe_for_binlog option is used,
+			we do not lock gaps. Supremum record is really
+			a gap and therefore we do not set locks there. */
+			
+			if (!srv_locks_unsafe_for_binlog) {
+				offsets = rec_get_offsets(rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+				err = sel_set_rec_lock(rec, index, offsets,
+						prebuilt->select_lock_type,
+						LOCK_ORDINARY, thr);
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+			}
+
+		}
+		/* A page supremum record cannot be in the result set: skip
+		it now that we have placed a possible lock on it */
+		
+		goto next_rec;
+	}
+
+	/*-------------------------------------------------------------*/
+	/* Do sanity checks in case our cursor has bumped into page
+	corruption */
+	
+	next_offs = rec_get_next_offs(rec, comp);
+
+	if (next_offs >= UNIV_PAGE_SIZE
+		|| next_offs <
+		(ulint) (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)) {
+
+		if (srv_force_recovery == 0 || moves_up == FALSE) {
+			ut_print_timestamp(stderr);
+			buf_page_print(buf_frame_align(rec));
+			fprintf(stderr,
+"\nInnoDB: rec address %p, first buffer frame %p\n"
+"InnoDB: buffer pool high end %p, buf block fix count %lu\n",
+				rec, buf_pool->frame_zero,
+				buf_pool->high_end,
+				(ulong)buf_block_align(rec)->buf_fix_count);
+			fprintf(stderr,
+"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
+"InnoDB: ",
+				(ulong) (rec - buf_frame_align(rec)),
+				(ulong) next_offs,
+				(ulong) buf_frame_get_page_no(rec));
+			dict_index_name_print(stderr, trx, index);
+			fputs(". Run CHECK TABLE. You may need to\n"
+"InnoDB: restore from a backup, or dump + drop + reimport the table.\n",
+			      stderr);
+		
+			err = DB_CORRUPTION;
+
+			goto lock_wait_or_error;
+		} else {
+			/* The user may be dumping a corrupt table. Jump
+			over the corruption to recover as much as possible. */
+
+			fprintf(stderr,
+"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
+"InnoDB: ",
+			   (ulong) (rec - buf_frame_align(rec)),
+			   (ulong) next_offs,
+			   (ulong) buf_frame_get_page_no(rec));
+			dict_index_name_print(stderr, trx, index);
+			fputs(". We try to skip the rest of the page.\n",
+				stderr);
+
+			btr_pcur_move_to_last_on_page(pcur, &mtr);
+
+			goto next_rec;
+		}
+	}
+
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	if (srv_force_recovery > 0) {
+		if (!rec_validate(rec, offsets)
+		|| !btr_index_rec_validate(rec, index, FALSE)) {
+			fprintf(stderr,
+"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
+"InnoDB: ",
+			   (ulong) (rec - buf_frame_align(rec)),
+			   (ulong) next_offs,
+			   (ulong) buf_frame_get_page_no(rec));
+			dict_index_name_print(stderr, trx, index);
+			fputs(". We try to skip the record.\n",
+				stderr);
+
+			goto next_rec;
+		}
+	}
+
+	/*-------------------------------------------------------------*/
+
+	/* Note that we cannot trust the up_match value in the cursor at this
+	place because we can arrive here after moving the cursor! Thus
+	we have to recompare rec and search_tuple to determine if they
+	match enough. */
+
+	if (match_mode == ROW_SEL_EXACT) {
+		/* Test if the index record matches completely to search_tuple
+		in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+		/* fputs("Comparing rec and search tuple\n", stderr); */
+		
+		if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+			if (prebuilt->select_lock_type != LOCK_NONE
+		    	    && set_also_gap_locks) {
+
+				/* Try to place a gap lock on the index 
+				record only if innodb_locks_unsafe_for_binlog
+				option is not set */
+
+				if (srv_locks_unsafe_for_binlog == FALSE) { 
+
+					err = sel_set_rec_lock(rec, index,
+						offsets,
+						prebuilt->select_lock_type,
+						LOCK_GAP, thr);
+					if (err != DB_SUCCESS) {
+
+						goto lock_wait_or_error;
+					}
+				}
+
+			}
+
+			btr_pcur_store_position(pcur, &mtr);
+
+			err = DB_RECORD_NOT_FOUND;
+			/* ut_print_name(stderr, index->name);
+			fputs(" record not found 3\n", stderr); */
+			
+			goto normal_return;
+		}
+
+	} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+		if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+			
+			if (prebuilt->select_lock_type != LOCK_NONE
+			    && set_also_gap_locks) {
+
+				/* Try to place a gap lock on the index 
+				record only if innodb_locks_unsafe_for_binlog
+				option is not set */
+
+				if (srv_locks_unsafe_for_binlog == FALSE) {
+
+					err = sel_set_rec_lock(rec, index,
+						offsets,
+						prebuilt->select_lock_type,
+						LOCK_GAP, thr);
+					if (err != DB_SUCCESS) {
+
+						goto lock_wait_or_error;
+					}
+				}
+
+			}
+
+			btr_pcur_store_position(pcur, &mtr);
+
+			err = DB_RECORD_NOT_FOUND;
+			/* ut_print_name(stderr, index->name);
+			fputs(" record not found 4\n", stderr); */
+
+			goto normal_return;
+		}
+	}
+		
+	/* We are ready to look at a possible new index entry in the result
+	set: the cursor is now placed on a user record */
+
+	cons_read_requires_clust_rec = FALSE;
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* Try to place a lock on the index record; note that delete
+		marked records are a special case in a unique search. If there
+		is a non-delete marked record, then it is enough to lock its
+		existence with LOCK_REC_NOT_GAP. */
+
+		ulint	lock_type;
+
+		if (!set_also_gap_locks
+		    || (unique_search && !rec_get_deleted_flag(rec, comp))) {
+			lock_type = LOCK_REC_NOT_GAP;
+		} else {
+			/* If innodb_locks_unsafe_for_binlog option is used, 
+			we lock only the record, i.e., next-key locking is
+			not used. */
+
+	                if (srv_locks_unsafe_for_binlog) {
+				lock_type = LOCK_REC_NOT_GAP;
+			} else {
+				lock_type = LOCK_ORDINARY;
+ 			}
+		}
+
+		/* If we are doing a 'greater or equal than a primary key
+		value' search from a clustered index, and we find a record
+		that has that exact primary key value, then there is no need
+		to lock the gap before the record, because no insert in the
+		gap can be in our search range. That is, no phantom row can
+		appear that way.
+
+		An example: if col1 is the primary key, the search is WHERE
+		col1 >= 100, and we find a record where col1 = 100, then no
+		need to lock the gap before that record. */
+
+		if (index == clust_index
+		    && mode == PAGE_CUR_GE
+		    && direction == 0
+		    && dtuple_get_n_fields_cmp(search_tuple)
+		       == dict_index_get_n_unique(index)
+		    && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+			lock_type = LOCK_REC_NOT_GAP;
+		}
+
+		err = sel_set_rec_lock(rec, index, offsets,
+					prebuilt->select_lock_type,
+					lock_type, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+	} else {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+			/* Do nothing: we let a non-locking SELECT read the
+			latest version of the record */
+		
+		} else if (index == clust_index) {
+			  
+			/* Fetch a previous version of the row if the current
+			one is not visible in the snapshot; if we have a very
+			high force recovery level set, we try to avoid crashes
+			by skipping this lookup */
+
+			if (srv_force_recovery < 5
+                            && !lock_clust_rec_cons_read_sees(rec, index,
+						offsets, trx->read_view)) {
+
+				err = row_sel_build_prev_vers_for_mysql(
+						trx->read_view, clust_index,
+						prebuilt, rec,
+						&offsets, &heap,
+						&old_vers, &mtr);
+						
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+
+				if (old_vers == NULL) {
+					/* The row did not exist yet in
+					the read view */
+
+					goto next_rec;
+				}
+
+				rec = old_vers;
+			}
+		} else if (!lock_sec_rec_cons_read_sees(rec, index,
+							trx->read_view)) {
+			/* We are looking into a non-clustered index,
+			and to get the right version of the record we
+			have to look also into the clustered index: this
+			is necessary, because we can only get the undo
+			information via the clustered index record. */
+			
+			cons_read_requires_clust_rec = TRUE;
+		}
+	}
+
+	if (rec_get_deleted_flag(rec, comp)
+			&& !cons_read_requires_clust_rec) {
+
+		/* The record is delete-marked: we can skip it if this is
+		not a consistent read which might see an earlier version
+		of a non-clustered index record */
+		
+		goto next_rec;
+	}
+
+	/* Get the clustered index record if needed and if we did
+	not do the search using the clustered index */
+
+	index_rec = rec;
+
+	/* Before and after the following "if" block, "offsets" will be
+	related to "rec", which may be in "index", a secondary index or
+	the clustered index ("clust_index").  However, after this "if" block,
+	"rec" may be pointing to "clust_rec" of "clust_index". */
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (index != clust_index && (cons_read_requires_clust_rec
+				|| prebuilt->need_to_access_clustered)) {
+
+		/* It was a non-clustered index and we must fetch also the
+		clustered index record */
+
+		mtr_has_extra_clust_latch = TRUE;
+		
+		err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+							thr, &clust_rec,
+							&offsets, &heap, &mtr);
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+
+		if (clust_rec == NULL) {
+			/* The record did not exist in the read view */
+			ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+			goto next_rec;
+		}
+
+		if (rec_get_deleted_flag(clust_rec, comp)) {
+
+			/* The record is delete marked: we can skip it */
+
+			goto next_rec;
+		}
+		
+		if (prebuilt->need_to_access_clustered) {
+		        rec = clust_rec;
+			ut_ad(rec_offs_validate(rec, clust_index, offsets));
+		} else {
+			offsets = rec_get_offsets(rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+		}
+	}
+
+	/* We found a qualifying row */
+	ut_ad(rec_offs_validate(rec,
+				rec == clust_rec ? clust_index : index,
+				offsets));
+
+	if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD
+			&& prebuilt->select_lock_type == LOCK_NONE
+			&& !prebuilt->templ_contains_blob
+			&& !prebuilt->clust_index_was_generated
+			&& !prebuilt->used_in_HANDLER
+	                && prebuilt->template_type
+	                                 != ROW_MYSQL_DUMMY_TEMPLATE) {
+
+		/* Inside an update, for example, we do not cache rows,
+		since we may use the cursor position to do the actual
+		update, that is why we require ...lock_type == LOCK_NONE.
+		Since we keep space in prebuilt only for the BLOBs of
+		a single row, we cannot cache rows in the case there
+		are BLOBs in the fields to be fetched. In HANDLER we do
+		not cache rows because there the cursor is a scrollable
+		cursor. */
+
+		row_sel_push_cache_row_for_mysql(prebuilt, rec, offsets);
+
+		if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
+			
+			goto got_row;
+		}
+
+		goto next_rec;
+	} else {
+		if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
+			memcpy(buf + 4, rec - rec_offs_extra_size(offsets),
+					rec_offs_size(offsets));
+			mach_write_to_4(buf,
+					rec_offs_extra_size(offsets) + 4);
+		} else {
+			if (!row_sel_store_mysql_rec(buf, prebuilt,
+							rec, offsets)) {
+				err = DB_TOO_BIG_RECORD;
+
+				goto lock_wait_or_error;
+			}
+		}
+
+		if (prebuilt->clust_index_was_generated) {
+			if (rec != index_rec) {
+				offsets = rec_get_offsets(
+						index_rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+			}
+			row_sel_store_row_id_to_prebuilt(prebuilt, index_rec,
+							index, offsets);
+		}
+	}
+got_row:
+	/* We have an optimization to save CPU time: if this is a consistent
+	read on a unique condition on the clustered index, then we do not
+	store the pcur position, because any fetch next or prev will anyway
+	return 'end of file'. An exception is the MySQL HANDLER command
+	where the user can move the cursor with PREV or NEXT even after
+	a unique search. */
+
+	if (!unique_search_from_clust_index
+	    || prebuilt->select_lock_type == LOCK_X
+	    || prebuilt->used_in_HANDLER) {
+
+		/* Inside an update always store the cursor position */
+
+		btr_pcur_store_position(pcur, &mtr);
+	}
+
+	err = DB_SUCCESS;
+
+	goto normal_return;
+
+next_rec:
+	/*-------------------------------------------------------------*/
+	/* PHASE 5: Move the cursor to the next index record */
+	
+	if (mtr_has_extra_clust_latch) {
+		/* We must commit mtr if we are moving to the next
+		non-clustered index record, because we could break the
+		latching order if we would access a different clustered
+		index page right away without releasing the previous. */
+
+		btr_pcur_store_position(pcur, &mtr);
+
+		mtr_commit(&mtr);
+		mtr_has_extra_clust_latch = FALSE;
+	
+		mtr_start(&mtr);
+		moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+							moves_up, &mtr);
+		if (moved) {
+			cnt++;
+
+			goto rec_loop;
+		}
+	}
+
+	if (moves_up) {		
+		moved = btr_pcur_move_to_next(pcur, &mtr);
+	} else {
+		moved = btr_pcur_move_to_prev(pcur, &mtr);
+	}
+
+	if (!moved) {
+		btr_pcur_store_position(pcur, &mtr);
+
+		if (match_mode != 0) {
+			err = DB_RECORD_NOT_FOUND;
+		} else {
+			err = DB_END_OF_INDEX;
+		}
+
+		goto normal_return;
+	}
+
+	cnt++;
+
+	goto rec_loop;
+
+lock_wait_or_error:
+	/*-------------------------------------------------------------*/
+
+	btr_pcur_store_position(pcur, &mtr);
+
+	mtr_commit(&mtr);
+	mtr_has_extra_clust_latch = FALSE;
+		
+	trx->error_state = err;
+
+	/* The following is a patch for MySQL */
+
+	que_thr_stop_for_mysql(thr);
+
+  thr->lock_state= QUE_THR_LOCK_ROW;
+	was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+  thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+	if (was_lock_wait) {
+		mtr_start(&mtr);
+
+		sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+							moves_up, &mtr);
+		mode = pcur->search_mode;
+
+		goto rec_loop;
+	}
+
+/*	fputs("Using ", stderr);
+	dict_index_name_print(stderr, index);
+	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+	goto func_exit;
+
+normal_return:
+	/*-------------------------------------------------------------*/
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	mtr_commit(&mtr);
+
+	if (prebuilt->n_fetch_cached > 0) {
+		row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+		err = DB_SUCCESS;
+	}
+
+/*	fputs("Using ", stderr);
+	dict_index_name_print(stderr, index);
+	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+	if (err == DB_SUCCESS) {
+		srv_n_rows_read++;
+	}
+
+func_exit:
+	trx->op_info = "";
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/***********************************************************************
+Decides whether MySQL may at this moment, for the given table, fetch a
+consistent read result from the query cache or store one into it. */
+
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+					/* out: TRUE if storing or retrieving
+					from the query cache is permitted */
+	trx_t*		trx,		/* in: transaction object */
+	const char*	norm_name)	/* in: concatenation of database name,
+					'/' char, table name */
+{
+	dict_table_t*	tbl;
+	ibool		permitted;
+
+	tbl = dict_table_get(norm_name, trx);
+
+	if (!tbl) {
+		/* dict_table_get() gave us no table for norm_name */
+		return(FALSE);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	/* Make sure that the transaction has been started */
+
+	trx_start_if_not_started_low(trx);
+
+	/* The cache may not be used if there are locks on the table, or if
+	some trx has invalidated the cache up to our trx id. The types of the
+	locks are not examined, even though only IX type locks would really
+	make a refusal necessary. */
+
+	if (UT_LIST_GET_LEN(tbl->locks) != 0
+	    || ut_dulint_cmp(trx->id, tbl->query_cache_inv_trx_id) < 0) {
+		permitted = FALSE;
+	} else {
+		permitted = TRUE;
+
+		/* At a high isolation level the transaction gets a read
+		view assigned here, unless it already has one */
+
+		if (!trx->read_view
+		    && trx->isolation_level >= TRX_ISO_REPEATABLE_READ) {
+			trx->read_view = read_view_open_now(trx,
+						trx->read_view_heap);
+		}
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	return(permitted);
+}
diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.c
new file mode 100644
index 00000000000..9dc860d70b1
--- /dev/null
+++ b/storage/innobase/row/row0uins.c
@@ -0,0 +1,308 @@
+/******************************************************
+Fresh insert undo
+
+(c) 1996 Innobase Oy
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*******************************************************************
+Removes a clustered index record: optimistic delete first, then pessimistic.
+The pcur in node was positioned on the record, now it is detached. */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node)	/* in: undo node */
+{
+	btr_cur_t*	btr_cur;		
+	ibool		success;
+	ulint		err;
+	ulint		n_tries		= 0;	/* retries done so far */
+	mtr_t		mtr;
+	
+	mtr_start(&mtr);
+	
+	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+									&mtr);
+	ut_a(success);
+
+	if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+		/* Drop the index tree associated with the row in
+		SYS_INDEXES table: */
+	
+		dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+		mtr_commit(&mtr);
+		/* Continue in a new mtr and restore the cursor again */
+		mtr_start(&mtr);
+
+		success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+						&(node->pcur), &mtr);
+		ut_a(success);
+	}
+		
+	btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+	/* First attempt: optimistic delete */
+	success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+	if (success) {
+		trx_undo_rec_release(node->trx, node->undo_no);
+
+		return(DB_SUCCESS);
+	}
+retry:
+	/* If did not succeed, try pessimistic descent to tree */
+	mtr_start(&mtr);
+	
+	success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+							&(node->pcur), &mtr);
+	ut_a(success);
+
+	btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	if (err == DB_OUT_OF_FILE_SPACE
+				&& n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+		/* Commit, sleep and retry, up to the retry limit */
+		btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+			
+		goto retry;
+	}
+	/* Done, with or without success: commit and release the undo record */
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+	trx_undo_rec_release(node->trx, node->undo_no);
+
+	return(err);
+}
+
+/*******************************************************************
+Searches for a secondary index entry and deletes it if it is found. The
+descent mode chooses between an optimistic and a pessimistic delete. */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+				/* out: DB_SUCCESS, DB_FAIL, or
+				DB_OUT_OF_FILE_SPACE */
+	ulint		mode,	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry)	/* in: index entry to remove */
+{
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	btr_cur_t*	cursor;
+	ibool		exists;
+	ulint		err	= DB_SUCCESS;
+
+	log_free_check();
+
+	mtr_start(&mtr);
+
+	/* Look the entry up in the index */
+
+	exists = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	cursor = btr_pcur_get_btr_cur(&pcur);
+
+	/* If the entry does not exist, there is nothing to delete and err
+	keeps its initial value DB_SUCCESS */
+
+	if (exists && mode == BTR_MODIFY_LEAF) {
+
+		/* Optimistic delete: a failure is reported as DB_FAIL */
+
+		if (!btr_cur_optimistic_delete(cursor, &mtr)) {
+			err = DB_FAIL;
+		}
+	} else if (exists) {
+		ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* Pessimistic delete: the callee stores the result in err */
+
+		btr_cur_pessimistic_delete(&err, FALSE, cursor, TRUE, &mtr);
+	}
+
+	/* Close the cursor and commit the mtr on every path */
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/*******************************************************************
+Deletes a secondary index entry from the index, if it is there. An
+optimistic descent is tried first, then pessimistic ones. */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry)	/* in: index entry to remove */
+{
+	ulint	err;
+	ulint	n_retries;
+
+	/* First attempt: optimistic descent to the B-tree */
+
+	err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+	if (err == DB_SUCCESS) {
+
+		return(DB_SUCCESS);
+	}
+
+	/* Then pessimistic descents: the delete may fail if little file
+	space is left, so a failure is retried after a sleep until the
+	retry limit is reached. TODO: easiest to crash the database and
+	restart with more file space */
+
+	for (n_retries = 0; ; n_retries++) {
+		err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index,
+								entry);
+		if (err == DB_SUCCESS
+		    || n_retries >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+			break;
+		}
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+
+	return(err);
+}
+
+/***************************************************************
+Parses a fresh insert undo record: stores the record type, the table and,
+if the table is usable, the row reference in the undo node. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+	undo_node_t*	node)	/* in: row undo node */
+{
+	byte*		ref_ptr;
+	dulint		undo_no;
+	dulint		table_id;
+	ulint		rec_type;
+	ulint		dummy;
+	ibool		dummy_extern;
+
+	ut_ad(node);
+
+	ref_ptr = trx_undo_rec_get_pars(node->undo_rec, &rec_type, &dummy,
+					&dummy_extern, &undo_no, &table_id);
+	ut_ad(rec_type == TRX_UNDO_INSERT_REC);
+	node->rec_type = rec_type;
+
+	node->table = dict_table_get_on_id(table_id, node->trx);
+
+	if (node->table != NULL && node->table->ibd_file_missing) {
+		/* We skip undo operations to missing .ibd files */
+
+		node->table = NULL;
+	}
+
+	if (node->table == NULL) {
+		/* No table to operate on: node->ref is left untouched */
+		return;
+	}
+
+	/* Read the row reference, using the first index of the table */
+
+	trx_undo_rec_get_row_ref(ref_ptr,
+				dict_table_get_first_index(node->table),
+				&(node->ref), node->heap);
+}
+	
+/***************************************************************
+Rolls back a fresh insert of a row to a table: first the entries of the
+row in the other indexes are removed, then the clustered index record.
+A fresh insert means that the same clustered index unique key did not
+have any record, even delete marked, at the time of the insert. */
+
+ulint
+row_undo_ins(
+/*=========*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node)	/* in: row undo node */
+{
+	dtuple_t*	sec_entry;
+	ulint		err;
+
+	ut_ad(node);
+	ut_ad(node->state == UNDO_NODE_INSERT);
+
+	row_undo_ins_parse_undo_rec(node);
+
+	if (node->table == NULL || !row_undo_search_clust_to_pcur(node)) {
+
+		/* There is no table to work on, or the clustered index
+		record was not found: only release the undo log record */
+
+		trx_undo_rec_release(node->trx, node->undo_no);
+
+		return(DB_SUCCESS);
+	}
+
+	/* Walk through the indexes that come after the first one, removing
+	the entry of the row from each */
+
+	for (node->index = dict_table_get_next_index(
+				dict_table_get_first_index(node->table));
+	     node->index != NULL;
+	     node->index = dict_table_get_next_index(node->index)) {
+
+		sec_entry = row_build_index_entry(node->row, node->index,
+								node->heap);
+		err = row_undo_ins_remove_sec(node->index, sec_entry);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+	}
+
+	/* Last, remove the clustered index record itself */
+
+	return(row_undo_ins_remove_clust_rec(node));
+}
diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c
new file mode 100644
index 00000000000..1cade0f304f
--- /dev/null
+++ b/storage/innobase/row/row0umod.c
@@ -0,0 +1,773 @@
+/******************************************************
+Undo modify of a row
+
+(c) 1997 Innobase Oy
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may already have the delete mark FALSE, if the delete mark operation
+was interrupted midway, or if the undo operation ended prematurely because
+of a system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/***************************************************************
+Tells whether the previous version of the clustered index record was also
+modified or inserted by this same transaction with an undo number that
+falls within the current rollback, so that it must be undone as well. */
+UNIV_INLINE
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+				/* out: TRUE if also previous modify or
+				insert of this row should be undone */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr,	/* in: query thread */
+	dulint*		undo_no)/* out: the undo number */
+{
+	trx_t*		trx	= node->trx;
+	trx_undo_rec_t*	prev_undo_rec;
+
+	UT_NOT_USED(thr);
+
+	if (ut_dulint_cmp(node->new_trx_id, trx->id) != 0) {
+		/* node->new_trx_id is not the id of this transaction */
+
+		return(FALSE);
+	}
+
+	prev_undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr,
+								node->heap);
+	*undo_no = trx_undo_rec_get_undo_no(prev_undo_rec);
+
+	/* The previous version belongs to this rollback only if its undo
+	number is not below trx->roll_limit */
+
+	if (ut_dulint_cmp(trx->roll_limit, *undo_no) > 0) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+	
+/***************************************************************
+Undoes a modify in a clustered index record by applying node->update
+to it, either optimistically or pessimistically depending on mode. */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+				/* out: DB_SUCCESS, DB_FAIL, or error code:
+				we may run out of file space */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr,	/* in: query thread */
+	mtr_t*		mtr,	/* in: mtr */
+	ulint		mode)	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+	btr_pcur_t*	pcur	= &(node->pcur);
+	btr_cur_t*	btr_cur	= btr_pcur_get_btr_cur(pcur);
+	big_rec_t*	dummy_big_rec;
+	ibool		restored;
+
+	restored = btr_pcur_restore_position(mode, pcur, mtr);
+
+	ut_ad(restored);
+
+	if (mode != BTR_MODIFY_LEAF) {
+		ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* The tree structure may have to be modified: do the
+		update pessimistically */
+
+		return(btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
+					| BTR_NO_UNDO_LOG_FLAG
+					| BTR_KEEP_SYS_FLAG,
+					btr_cur, &dummy_big_rec, node->update,
+					node->cmpl_info, thr, mtr));
+	}
+
+	/* Try to do the update optimistically, keeping the changes
+	within the index page */
+
+	return(btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+					| BTR_NO_UNDO_LOG_FLAG
+					| BTR_KEEP_SYS_FLAG,
+					btr_cur, node->update,
+					node->cmpl_info, thr, mtr));
+}
+		
+/***************************************************************
+Removes a clustered index record after undo if that is possible, that is,
+if the cursor can be restored and the record need not be preserved. */
+static
+ulint
+row_undo_mod_remove_clust_low(
+/*==========================*/
+				/* out: DB_SUCCESS, DB_FAIL, or error code:
+				we may run out of file space */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr __attribute__((unused)), /* in: query thread */
+	mtr_t*		mtr,	/* in: mtr */
+	ulint		mode)	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+	btr_pcur_t*	pcur	= &(node->pcur);
+	btr_cur_t*	btr_cur	= btr_pcur_get_btr_cur(pcur);
+	ulint		err;
+
+	if (!btr_pcur_restore_position(mode, pcur, mtr)) {
+		/* btr_pcur_restore_position() did not succeed: we do
+		nothing and report success */
+
+		return(DB_SUCCESS);
+	}
+
+	/* Find out if we can remove the whole clustered index record:
+	that is allowed only when the undo record type is
+	TRX_UNDO_UPD_DEL_REC and the delete marked version need not be
+	preserved */
+
+	if (node->rec_type != TRX_UNDO_UPD_DEL_REC
+	    || row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+		return(DB_SUCCESS);
+	}
+
+	if (mode != BTR_MODIFY_LEAF) {
+		ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* Note that since this operation is analogous to purge,
+		we can free also inherited externally stored fields:
+		hence the last FALSE in the call below */
+
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, FALSE, mtr);
+
+		/* The delete operation may fail if we have little
+		file space left: TODO: easiest to crash the database
+		and restart with more file space */
+
+		return(err);
+	}
+
+	/* Optimistic delete: a failure is reported to the caller
+	as DB_FAIL */
+
+	if (btr_cur_optimistic_delete(btr_cur, mtr)) {
+
+		return(DB_SUCCESS);
+	}
+
+	return(DB_FAIL);
+}
+		
+/***************************************************************
+Undoes a modify in a clustered index record and releases the undo log record.
+Sets also the node state for the next round of undo. */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+				/* out: DB_SUCCESS or error code: we may run
+				out of file space */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	btr_pcur_t*	pcur;
+	mtr_t		mtr;
+	ulint		err;
+	ibool		success;
+	ibool		more_vers;	/* also undo the previous version? */
+	dulint		new_undo_no;	/* undo number of that version */
+	
+	ut_ad(node && thr);
+
+	/* Check if also the previous version of the clustered index record
+	should be undone in this same rollback operation */
+
+	more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no);
+
+	pcur = &(node->pcur);
+
+	mtr_start(&mtr);
+
+	/* Try optimistic processing of the record, keeping changes within
+	the index page */
+
+	err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+	if (err != DB_SUCCESS) {
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+		/* We may have to modify tree structure: do a pessimistic
+		descent down the index tree */
+
+		mtr_start(&mtr);
+
+		err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+	}
+
+	btr_pcur_commit_specify_mtr(pcur, &mtr);
+	/* After undoing a TRX_UNDO_UPD_DEL_REC, try to remove the record */
+	if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+	
+		mtr_start(&mtr);
+
+		err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+							BTR_MODIFY_LEAF);
+		if (err != DB_SUCCESS) {
+			btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+			/* We may have to modify tree structure: do a
+			pessimistic descent down the index tree */
+
+			mtr_start(&mtr);
+
+			err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+							BTR_MODIFY_TREE);
+		}
+
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+	}
+	/* Default next state; replaced below if a prior version is reserved */
+	node->state = UNDO_NODE_FETCH_NEXT;
+	
+ 	trx_undo_rec_release(node->trx, node->undo_no);
+
+	if (more_vers && err == DB_SUCCESS) {
+
+		/* Reserve the undo log record to the prior version after
+		committing &mtr: this is necessary to comply with the latching
+		order, as &mtr may contain the fsp latch which is lower in
+		the latch hierarchy than trx->undo_mutex. */
+
+		success = trx_undo_rec_reserve(node->trx, new_undo_no);
+
+		if (success) {
+			node->state = UNDO_NODE_PREV_VERS;
+		}
+	}
+
+	return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found. */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+				/* out: DB_SUCCESS, DB_FAIL, or
+				DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr,	/* in: query thread */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry */
+	ulint		mode)	/* in: latch mode BTR_MODIFY_LEAF or
+				BTR_MODIFY_TREE */	
+{
+	ibool		found;
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ibool		old_has;
+	ulint		err;
+	mtr_t		mtr;		/* used with pcur on index */
+	mtr_t		mtr_vers;	/* used with node->pcur */
+	
+	log_free_check();
+	mtr_start(&mtr);
+	
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	if (!found) {
+		/* Not found */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(DB_SUCCESS);
+	}
+
+	/* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some requires,
+	we should delete mark the record. */
+
+	mtr_start(&mtr_vers);
+		
+	success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+								&mtr_vers);
+	ut_a(success);
+		
+	old_has = row_vers_old_has_index_entry(FALSE,
+					btr_pcur_get_rec(&(node->pcur)),
+					&mtr_vers, index, entry);
+	if (old_has) {	/* the entry is still required: only delete mark it */
+		err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+						btr_cur, TRUE, thr, &mtr);
+		ut_ad(err == DB_SUCCESS);
+	} else {
+		/* Remove the index record */
+
+		if (mode == BTR_MODIFY_LEAF) {		
+			success = btr_cur_optimistic_delete(btr_cur, &mtr);
+			if (success) {
+				err = DB_SUCCESS;
+			} else {
+				err = DB_FAIL;
+			}
+		} else {
+			ut_ad(mode == BTR_MODIFY_TREE);
+
+			btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+								TRUE, &mtr);
+
+			/* The delete operation may fail if we have little
+			file space left: TODO: easiest to crash the database
+			and restart with more file space */
+		}
+	}
+	/* Commit mtr_vers first, then close pcur and commit mtr */
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry, if one is found. An
+optimistic attempt in BTR_MODIFY_LEAF mode is made first; the operation is
+repeated in BTR_MODIFY_TREE mode only if that does not return DB_SUCCESS.
+NOTE: if the fields of a delete-marked secondary index record were updated
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. This should not
+cause problems: queries in row0sel.c always retrieve the clustered index
+record, or an earlier version of it, if the sec index record is del-marked. */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr,	/* in: query thread */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry)	/* in: index entry */
+{
+	ulint	optimistic_err;
+
+	optimistic_err = row_undo_mod_del_mark_or_remove_sec_low(node, thr,
+					index, entry, BTR_MODIFY_LEAF);
+	if (optimistic_err != DB_SUCCESS) {
+		/* Retry, this time in the BTR_MODIFY_TREE latch mode */
+		return(row_undo_mod_del_mark_or_remove_sec_low(node, thr,
+					index, entry, BTR_MODIFY_TREE));
+	}
+
+	return(optimistic_err);
+}
+
+/***************************************************************
+Delete unmarks a secondary index entry which must be found (if it is not, an
+error is printed and DB_SUCCESS returned). It might not be delete-marked at
+the moment, but unmarking it does no harm. We also update the fields of the
+record if we updated them but alphabetically they stayed the same: 'abc' -> 'aBc'. */
+static
+ulint
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+				/* out: DB_FAIL or DB_SUCCESS or
+				DB_OUT_OF_FILE_SPACE */
+	ulint		mode,	/* in: search mode: BTR_MODIFY_LEAF or
+				BTR_MODIFY_TREE */
+	que_thr_t*	thr,	/* in: query thread */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry)	/* in: index entry */
+{
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	upd_t*		update;
+	ulint		err		= DB_SUCCESS;
+	ibool		found;
+	big_rec_t*	dummy_big_rec;
+	mtr_t		mtr;
+	trx_t*		trx		= thr_get_trx(thr);
+
+	log_free_check();
+	mtr_start(&mtr);
+	
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	if (!found) {	/* only reported to stderr: err stays DB_SUCCESS */
+		fputs("InnoDB: error in sec index entry del undo in\n"
+			"InnoDB: ", stderr);
+		dict_index_name_print(stderr, trx, index);
+		fputs("\n"
+			"InnoDB: tuple ", stderr);
+		dtuple_print(stderr, entry);
+		fputs("\n"
+			"InnoDB: record ", stderr);
+		rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+		putc('\n', stderr);
+		trx_print(stderr, trx);
+		fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+	} else {
+		btr_cur_t*	btr_cur = btr_pcur_get_btr_cur(&pcur);
+		/* Delete unmark the record; ut_a() asserts that this succeeds */
+	        err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+						btr_cur, FALSE, thr, &mtr);
+	        ut_a(err == DB_SUCCESS);
+		heap = mem_heap_create(100);
+		/* Presumably the fields in which the record differs from entry */
+		update = row_upd_build_sec_rec_difference_binary(index, entry,
+			btr_cur_get_rec(btr_cur), trx, heap);
+	        if (upd_get_n_fields(update) == 0) {
+
+			/* The update vector is empty: do nothing */
+		
+		} else if (mode == BTR_MODIFY_LEAF) {
+                	/* Try an optimistic updating of the record, keeping
+			changes within the page */
+
+                	err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG
+							| BTR_NO_LOCKING_FLAG,
+ 						btr_cur, update, 0, thr, &mtr);
+                	if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
+                        	err = DB_FAIL;
+                	}
+       		} else  {
+                	ut_a(mode == BTR_MODIFY_TREE);
+                	err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG
+							| BTR_NO_LOCKING_FLAG,
+						btr_cur, &dummy_big_rec,
+						update, 0, thr, &mtr);
+        	}			
+
+		mem_heap_free(heap);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/***************************************************************
+Undoes a modify in the secondary indexes when the undo record type is
+UPD_DEL: walks the indexes starting from node->index and delete marks or
+removes the entry built from node->row in each of them. The heap holding
+the built entries is freed before returning. */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	mem_heap_t*	entry_heap	= mem_heap_create(1024);
+	ulint		err		= DB_SUCCESS;
+
+	/* On an error node->index is left pointing to the failing index */
+
+	for (; node->index != NULL;
+		node->index = dict_table_get_next_index(node->index)) {
+
+		dict_index_t*	cur_index	= node->index;
+		dtuple_t*	sec_entry;
+
+		sec_entry = row_build_index_entry(node->row, cur_index,
+								entry_heap);
+		err = row_undo_mod_del_mark_or_remove_sec(node, thr,
+							cur_index, sec_entry);
+		if (err != DB_SUCCESS) {
+
+			break;
+		}
+	}
+
+	mem_heap_free(entry_heap);
+
+	return(err);
+}
+
+/***************************************************************
+Undoes a modify in the secondary indexes when the undo record type is
+DEL_MARK: for each index from node->index onwards, the entry built from
+node->row is delete unmarked, first in BTR_MODIFY_LEAF mode and, if that
+returns DB_FAIL, again in BTR_MODIFY_TREE mode. */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	mem_heap_t*	entry_heap	= mem_heap_create(1024);
+	ulint		err		= DB_SUCCESS;
+
+	/* On an error node->index is left pointing to the failing index */
+	for (; node->index != NULL;
+		node->index = dict_table_get_next_index(node->index)) {
+		dict_index_t*	cur_index	= node->index;
+		dtuple_t*	sec_entry;
+
+		sec_entry = row_build_index_entry(node->row, cur_index,
+								entry_heap);
+
+		/* First attempt: BTR_MODIFY_LEAF */
+		err = row_undo_mod_del_unmark_sec_and_undo_update(
+						BTR_MODIFY_LEAF,
+						thr, cur_index, sec_entry);
+		if (err == DB_FAIL) {
+			/* Second attempt: BTR_MODIFY_TREE */
+			err = row_undo_mod_del_unmark_sec_and_undo_update(
+						BTR_MODIFY_TREE,
+						thr, cur_index, sec_entry);
+		}
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+	}
+
+	mem_heap_free(entry_heap);
+
+	return(err);
+}
+
+/***************************************************************
+Undoes a modify in the secondary indexes when the undo record type is
+UPD_EXIST. Indexes whose ordering fields the update does not change are
+skipped; in the others the newest entry is delete marked or removed, and
+the entry of the previous version of the row is delete unmarked. */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	mem_heap_t*	entry_heap;
+	ulint		err	= DB_SUCCESS;
+
+	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+		/* No change in secondary indexes */
+
+		return(DB_SUCCESS);
+	}
+
+	entry_heap = mem_heap_create(1024);
+
+	/* On an error node->index is left pointing to the failing index */
+	for (; node->index != NULL;
+		node->index = dict_table_get_next_index(node->index)) {
+		dict_index_t*	cur_index	= node->index;
+		dtuple_t*	sec_entry;
+
+		if (!row_upd_changes_ord_field_binary(node->row, cur_index,
+							node->update)) {
+			/* Nothing to undo in this index */
+			continue;
+		}
+
+		/* Build the newest version of the index entry */
+		sec_entry = row_build_index_entry(node->row, cur_index,
+								entry_heap);
+
+		/* NOTE that if we updated the fields of a delete-marked
+		secondary index record so that alphabetically they stayed
+		the same, e.g., 'abc' -> 'aBc', we cannot return to the
+		original values because we do not know them. But this
+		should not cause problems because in row0sel.c, in queries
+		we always retrieve the clustered index record or an
+		earlier version of it, if the secondary index record
+		through which we do the search is
+		delete-marked. */
+
+		err = row_undo_mod_del_mark_or_remove_sec(node, thr,
+							cur_index, sec_entry);
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		/* We may have to update the delete mark in the secondary
+		index record of the previous version of the row. We also
+		need to update the fields of the secondary index record
+		if we updated its fields but alphabetically they stayed
+		the same, e.g.,
+		'abc' -> 'aBc'. */
+
+		row_upd_index_replace_new_col_vals(sec_entry, cur_index,
+							node->update, NULL);
+		err = row_undo_mod_del_unmark_sec_and_undo_update(
+						BTR_MODIFY_LEAF,
+						thr, cur_index, sec_entry);
+		if (err == DB_FAIL) {
+			err = row_undo_mod_del_unmark_sec_and_undo_update(
+						BTR_MODIFY_TREE,
+						thr, cur_index, sec_entry);
+		}
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+	}
+
+	mem_heap_free(entry_heap);
+
+	return(err);
+}
+
+/***************************************************************
+Parses a modify undo log record: stores in node the record type, the table,
+the row reference, the update vector, and the trx id, roll ptr and cmpl_info
+read from the record. node->table is left NULL if the table is not found or
+its .ibd file is missing, and nothing more is parsed in that case. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_index_t*	clust;
+	trx_t*		trx;
+	byte*		pos;
+	dulint		rec_undo_no;
+	dulint		rec_table_id;
+	dulint		rec_trx_id;
+	dulint		rec_roll_ptr;
+	ulint		rec_info_bits;
+	ulint		rec_type;
+	ulint		rec_cmpl_info;
+	ibool		unused_extern;
+
+	ut_ad(node && thr);
+	trx = thr_get_trx(thr);
+
+	pos = trx_undo_rec_get_pars(node->undo_rec, &rec_type,
+					&rec_cmpl_info, &unused_extern,
+					&rec_undo_no, &rec_table_id);
+	node->rec_type = rec_type;
+	node->table = dict_table_get_on_id(rec_table_id, trx);
+
+	/* TODO: other fixes associated with DROP TABLE + rollback in the
+	same table by another user */
+
+	if (node->table != NULL && node->table->ibd_file_missing) {
+		/* We skip undo operations to missing .ibd files */
+		node->table = NULL;
+	}
+
+	if (node->table == NULL) {
+		/* Table was dropped, or is skipped for the reason above */
+		return;
+	}
+
+	clust = dict_table_get_first_index(node->table);
+
+	pos = trx_undo_update_rec_get_sys_cols(pos, &rec_trx_id,
+					&rec_roll_ptr, &rec_info_bits);
+	pos = trx_undo_rec_get_row_ref(pos, clust, &(node->ref), node->heap);
+
+	trx_undo_update_rec_get_update(pos, clust, rec_type, rec_trx_id,
+					rec_roll_ptr, rec_info_bits, trx,
+					node->heap, &(node->update));
+	node->new_trx_id = rec_trx_id;
+	node->new_roll_ptr = rec_roll_ptr;
+	node->cmpl_info = rec_cmpl_info;
+}
+	
+/***************************************************************
+Undoes a modify operation on a row of a table: parses the undo log record,
+looks up the clustered index record, and undoes the change first in the
+secondary indexes and then in the clustered index. */
+
+ulint
+row_undo_mod(
+/*=========*/
+				/* out: DB_SUCCESS or error code */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ulint	sec_err;
+
+	ut_ad(node && thr);
+	ut_ad(node->state == UNDO_NODE_MODIFY);
+
+	row_undo_mod_parse_undo_rec(node, thr);
+
+	/* The search is attempted only if the parse step found the table */
+
+	if (node->table == NULL || !row_undo_search_clust_to_pcur(node)) {
+		/* It is already undone, or will be undone by another query
+		thread, or table was dropped */
+
+		trx_undo_rec_release(node->trx, node->undo_no);
+		node->state = UNDO_NODE_FETCH_NEXT;
+
+		return(DB_SUCCESS);
+	}
+
+	/* Start from the index that follows the first (clustered) index */
+
+	node->index = dict_table_get_next_index(
+				dict_table_get_first_index(node->table));
+
+	if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+		sec_err = row_undo_mod_upd_exist_sec(node, thr);
+
+	} else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+		sec_err = row_undo_mod_del_mark_sec(node, thr);
+	} else {
+		ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+
+		sec_err = row_undo_mod_upd_del_sec(node, thr);
+	}
+
+	if (sec_err != DB_SUCCESS) {
+		/* The clustered index step is not attempted */
+
+		return(sec_err);
+	}
+
+	/* Secondary indexes are done: undo the clustered index record */
+	return(row_undo_mod_clust(node, thr));
+}
diff --git a/storage/innobase/row/row0undo.c b/storage/innobase/row/row0undo.c
new file mode 100644
index 00000000000..abe73cbe705
--- /dev/null
+++ b/storage/innobase/row/row0undo.c
@@ -0,0 +1,350 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes and also roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that row id is in ascending order.
+So, let's store row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it does not update the secondary index and not the clustered index
+ord field. Then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
+
+/************************************************************************
+Creates a row undo node to a query graph. The node itself is allocated from
+the given heap, and a separate memory heap is created for node->heap. */
+
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+				/* out, own: undo node */
+	trx_t*		trx,	/* in: transaction */
+	que_thr_t*	parent,	/* in: parent node, i.e., a thr node */
+	mem_heap_t*	heap)	/* in: memory heap where created */
+{
+	undo_node_t*	new_node;
+
+	ut_ad(trx && parent && heap);
+	new_node = mem_heap_alloc(heap, sizeof(*new_node));
+
+	new_node->common.parent = parent;
+	new_node->common.type = QUE_NODE_UNDO;
+
+	/* The node starts in the state where an undo record is fetched */
+	new_node->trx = trx;
+	new_node->state = UNDO_NODE_FETCH_NEXT;
+
+	btr_pcur_init(&(new_node->pcur));
+	new_node->heap = mem_heap_create(256);
+
+	return(new_node);
+}
+
+/***************************************************************
+Looks for the clustered index record when node has the row reference. The
+pcur in node is used in the search. If the record is found and its roll ptr
+equals node->roll_ptr, builds the row into node->row, stores the position of
+pcur, and detaches it. The pcur must be closed by the caller in any case. */
+
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+				/* out: TRUE if found with a roll ptr equal to
+				node->roll_ptr; NOTE the node->pcur must be closed
+				by the caller, regardless of the return value */
+	undo_node_t*	node)	/* in: row undo node */
+{
+	dict_index_t*	clust_index;
+	ibool		found;
+	mtr_t		mtr;
+	ibool		ret;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;	/* slot 0 = array size */
+
+	mtr_start(&mtr);
+
+	clust_index = dict_table_get_first_index(node->table);
+	
+	found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+					node->table, node->ref, &mtr);
+	/* rec is read and its offsets computed even when found is FALSE */
+	rec = btr_pcur_get_rec(&(node->pcur));
+
+	offsets = rec_get_offsets(rec, clust_index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+	if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+			row_get_rec_roll_ptr(rec, clust_index, offsets))) {
+
+		/* We must remove the reservation on the undo log record
+		BEFORE releasing the latch on the clustered index page: this
+		is to make sure that some thread will eventually undo the
+		modification corresponding to node->roll_ptr. */
+		
+		/* fputs("--------------------undoing a previous version\n",
+			stderr); */
+		/* NOTE(review): nothing is released in this function itself */
+		ret = FALSE;
+	} else {
+		node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+						offsets, node->heap);
+		btr_pcur_store_position(&(node->pcur), &mtr);
+
+		ret = TRUE;
+	}
+
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+	if (heap) {	/* non-NULL presumably only if rec_get_offsets() allocated */
+		mem_heap_free(heap);
+	}
+	return(ret);
+}
+	
+/***************************************************************
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_undo(
+/*=====*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ulint	err;
+	trx_t*	trx;
+	dulint	roll_ptr;
+	ibool	froze_data_dict	= FALSE;
+	
+	ut_ad(node && thr);
+	
+	trx = node->trx;
+
+	if (node->state == UNDO_NODE_FETCH_NEXT) {
+		/* Fetch the next undo log record of trx; roll_ptr is set too */
+		node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+							trx->roll_limit,
+							&roll_ptr,
+							node->heap);
+		if (!node->undo_rec) {
+			/* Rollback completed for this query thread */
+
+			thr->run_node = que_node_get_parent(node);
+
+			return(DB_SUCCESS);
+		}
+		/* Remember which undo log record this node is working on */
+		node->roll_ptr = roll_ptr;
+		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+		/* The roll ptr tells whether to undo an insert or a modify */
+		if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+			node->state = UNDO_NODE_INSERT;
+		} else {
+			node->state = UNDO_NODE_MODIFY;
+		}
+
+	} else if (node->state == UNDO_NODE_PREV_VERS) {
+
+		/* Undo should be done to the same clustered index record
+		again in this same rollback, restoring the previous version */
+
+		roll_ptr = node->new_roll_ptr;
+		
+		node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+								node->heap);
+		node->roll_ptr = roll_ptr;
+		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+		
+		if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+			node->state = UNDO_NODE_INSERT;
+		} else {
+			node->state = UNDO_NODE_MODIFY;
+		}
+	}
+
+	/* Prevent DROP TABLE etc. while we are rolling back this row.
+        If we are doing a TABLE CREATE or some other dictionary operation,
+        then we already have dict_operation_lock locked in x-mode. Do not
+        try to lock again in s-mode, because that would cause a hang. */
+
+	if (trx->dict_operation_lock_mode == 0) {
+        
+	        row_mysql_freeze_data_dictionary(trx);
+
+	        froze_data_dict = TRUE;
+	}
+
+	if (node->state == UNDO_NODE_INSERT) {
+
+		err = row_undo_ins(node);
+		/* The state is reset here; row_undo_mod() sets it by itself */
+		node->state = UNDO_NODE_FETCH_NEXT;
+	} else {
+		ut_ad(node->state == UNDO_NODE_MODIFY);
+		err = row_undo_mod(node, thr);
+	}
+
+	if (froze_data_dict) {
+
+	        row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	/* Do some cleanup: close the cursor and empty the heap of the node */
+	btr_pcur_close(&(node->pcur));
+
+	mem_heap_empty(node->heap);
+	/* Keep this node as the run node of the query thread */
+	thr->run_node = node;
+
+	return(err);
+}
+
+/***************************************************************
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs. The error code of the undo is stored in
+trx->error_state; an error is reported to stderr as a fatal error. */
+
+que_thr_t*
+row_undo_step(
+/*==========*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	undo_node_t*	undo;
+	trx_t*		trx;
+	ulint		undo_err;
+
+	ut_ad(thr);
+
+	srv_activity_count++;
+
+	trx = thr_get_trx(thr);
+	undo = thr->run_node;
+
+	ut_ad(que_node_get_type(undo) == QUE_NODE_UNDO);
+
+	undo_err = row_undo(undo, thr);
+	trx->error_state = undo_err;
+
+	if (undo_err == DB_SUCCESS) {
+		/* Continue running the same query thread */
+		return(thr);
+	}
+
+	/* SQL error detected */
+
+	fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
+		(ulong) undo_err);
+
+	if (undo_err == DB_OUT_OF_FILE_SPACE) {
+		fprintf(stderr,
+			"InnoDB: Error 13 means out of tablespace.\n"
+			"InnoDB: Consider increasing your tablespace.\n");
+
+		exit(1);
+	}
+
+	ut_error;
+
+	return(NULL);
+}
diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c
new file mode 100644
index 00000000000..3305724a89b
--- /dev/null
+++ b/storage/innobase/row/row0upd.c
@@ -0,0 +1,2035 @@
+/******************************************************
+Update of a row
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+   -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+	Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged, the index records will be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/***************************************************************
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes. */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+				/* out: TRUE if changes */
+	dtuple_t*	entry,	/* in: old value of index entry */
+	dict_index_t*	index,	/* in: index of entry */
+	upd_t*		update,	/* in: update vector for the row */
+	ulint		n);	/* in: how many first fields to check */
+
+
+/*************************************************************************
+Checks if index currently is mentioned as a referenced index in a foreign
+key constraint. The data dictionary is frozen while the referenced_list of
+the table is scanned, unless trx->dict_operation_lock_mode is nonzero. */
+static
+ibool
+row_upd_index_is_referenced(
+/*========================*/
+				/* out: TRUE if referenced; NOTE that since
+				we do not hold dict_operation_lock
+				when leaving the function, it may be that
+				the referencing table has been dropped when
+				we leave this function: this function is only
+				for heuristic use! */
+	dict_index_t*	index,	/* in: index */
+	trx_t*		trx)	/* in: transaction */
+{
+	dict_table_t*	tbl		= index->table;
+	dict_foreign_t*	fk;
+	ibool		must_unfreeze;
+	ibool		is_referenced	= FALSE;
+
+	if (!UT_LIST_GET_FIRST(tbl->referenced_list)) {
+		/* The list of referencing constraints is empty */
+
+		return(FALSE);
+	}
+
+	must_unfreeze = (trx->dict_operation_lock_mode == 0);
+
+	if (must_unfreeze) {
+		row_mysql_freeze_data_dictionary(trx);
+	}
+
+	for (fk = UT_LIST_GET_FIRST(tbl->referenced_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(referenced_list, fk)) {
+
+		if (fk->referenced_index == index) {
+			is_referenced = TRUE;
+
+			break;
+		}
+	}
+
+	if (must_unfreeze) {
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	return(is_referenced);
+}
+
+/*************************************************************************
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur. NOTE that this function will temporarily commit mtr and lose the
+pcur position! */
+static
+ulint
+row_upd_check_references_constraints(
+/*=================================*/
+				/* out: DB_SUCCESS or the first check error */
+	upd_node_t*	node,	/* in: row update node */
+	btr_pcur_t*	pcur,	/* in: cursor positioned on a record; NOTE: the
+				cursor position is lost in this function! */
+	dict_table_t*	table,	/* in: table in question */
+	dict_index_t*	index,	/* in: index of the cursor */
+	que_thr_t*	thr,	/* in: query thread */
+	mtr_t*		mtr)	/* in: mtr; committed and restarted here */
+{
+	dict_foreign_t*	foreign;
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	trx_t*		trx;
+	rec_t*		rec;
+	ulint		err;
+	ibool		got_s_lock	= FALSE;
+
+	if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx = thr_get_trx(thr);
+
+	rec = btr_pcur_get_rec(pcur);
+
+	heap = mem_heap_create(500);
+	/* entry is built with ROW_COPY_DATA before mtr is committed below */
+	entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);
+
+	mtr_commit(mtr);	
+
+	mtr_start(mtr);	
+	
+	if (trx->dict_operation_lock_mode == 0) {
+		got_s_lock = TRUE;
+
+		row_mysql_freeze_data_dictionary(trx);
+	}
+		
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign) {
+		/* Note that we may have an update which updates the index
+		record, but does NOT update the first fields which are
+		referenced in a foreign key constraint. Then the update does
+		NOT break the constraint. */
+
+		if (foreign->referenced_index == index
+		    && (node->is_delete
+		       || row_upd_changes_first_fields_binary(entry, index,
+			    		node->update, foreign->n_fields))) {
+			/* NOTE(review): presumably dict_table_get() sets foreign->foreign_table */
+			if (foreign->foreign_table == NULL) {
+				dict_table_get(foreign->foreign_table_name,
+									trx);
+			}
+
+			if (foreign->foreign_table) {
+				mutex_enter(&(dict_sys->mutex));
+
+				(foreign->foreign_table
+				->n_foreign_key_checks_running)++;
+
+				mutex_exit(&(dict_sys->mutex));
+			}
+
+			/* NOTE that if the thread ends up waiting for a lock
+			we will release dict_operation_lock temporarily!
+			But the counter on the table protects 'foreign' from
+			being dropped while the check is running. */
+			
+			err = row_ins_check_foreign_constraint(FALSE, foreign,
+							table, entry, thr);
+
+			if (foreign->foreign_table) {
+				mutex_enter(&(dict_sys->mutex));
+
+				ut_a(foreign->foreign_table
+				->n_foreign_key_checks_running > 0);
+
+				(foreign->foreign_table
+				->n_foreign_key_checks_running)--;
+
+				mutex_exit(&(dict_sys->mutex));
+			}
+			/* Stop at the first failed check; undo the freeze and free heap */
+			if (err != DB_SUCCESS) {
+				if (got_s_lock) {
+					row_mysql_unfreeze_data_dictionary(
+									trx);
+				}
+
+				mem_heap_free(heap);
+
+				return(err);
+			}
+		}
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (got_s_lock) {
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	mem_heap_free(heap);
+	
+	return(DB_SUCCESS);
+}
+
+/*************************************************************************
+Creates an update node for a query graph. The node is allocated from the
+given heap; a separate memory heap is created for node->heap. */
+
+upd_node_t*
+upd_node_create(
+/*============*/
+				/* out, own: update node */
+	mem_heap_t*	heap)	/* in: mem heap where created */
+{
+	upd_node_t*	upd;
+
+	upd = mem_heap_alloc(heap, sizeof(*upd));
+	/* Identification and initial state */
+	upd->common.type = QUE_NODE_UPDATE;
+	upd->magic_n = UPD_NODE_MAGIC_N;
+	upd->state = UPD_NODE_UPDATE_CLUSTERED;
+	upd->cmpl_info = 0;
+
+	upd->select_will_do_update = FALSE;
+	upd->in_mysql_interface = FALSE;
+
+	/* All pointer fields below start out as NULL */
+	upd->select = NULL;
+	upd->row = NULL;
+	upd->ext_vec = NULL;
+	upd->index = NULL;
+	upd->update = NULL;
+	upd->foreign = NULL;
+	upd->cascade_heap = NULL;
+	upd->cascade_node = NULL;
+
+	upd->heap = mem_heap_create(128);
+
+	return(upd);
+}
+
+/*************************************************************************
+During recovery, overwrites the trx id field at position pos of a clustered
+index record and the roll ptr field that follows it at pos + 1. */
+
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+	rec_t*		rec,	/* in/out: record */
+	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
+	ulint		pos,	/* in: TRX_ID position in rec */
+	dulint		trx_id,	/* in: transaction id */
+	dulint		roll_ptr)/* in: roll ptr of the undo log record */
+{
+	byte*	trx_id_field;
+	byte*	roll_ptr_field;
+	ulint	len;
+
+	trx_id_field = rec_get_nth_field(rec, offsets, pos, &len);
+	ut_ad(len == DATA_TRX_ID_LEN);
+	trx_write_trx_id(trx_id_field, trx_id);
+	roll_ptr_field = rec_get_nth_field(rec, offsets, pos + 1, &len);
+	ut_ad(len == DATA_ROLL_PTR_LEN);
+	trx_write_roll_ptr(roll_ptr_field, roll_ptr);
+}
+
+/*************************************************************************
+Writes val into the trx id or roll ptr field of a clustered index entry. */
+
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+	dtuple_t*	entry,	/* in/out: index entry; the buffers of its
+				sys fields must already be allocated, as
+				this function only copies the new value
+				into them */
+	dict_index_t*	index,	/* in: clustered index */
+	ulint		type,	/* in: DATA_TRX_ID or DATA_ROLL_PTR */
+	dulint		val)	/* in: value to write */
+{
+	byte*	sys_buf;
+	ulint	sys_pos;
+
+	ut_ad(index->type & DICT_CLUSTERED);
+
+	/* Find the buffer of the requested system column in entry */
+	sys_pos = dict_index_get_sys_col_pos(index, type);
+	sys_buf = dfield_get_data(dtuple_get_nth_field(entry, sys_pos));
+
+	if (type != DATA_TRX_ID) {
+		ut_ad(type == DATA_ROLL_PTR);
+		trx_write_roll_ptr(sys_buf, val);
+
+		return;
+	}
+
+	trx_write_trx_id(sys_buf, val);
+}
+
+/***************************************************************
+Tells whether applying update to a record would change the size of some
+updated field, or involves a field stored externally in rec or update. */
+
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+				/* out: TRUE if the update changes the size of
+				some field in index or the field is external
+				in rec or update */
+	dict_index_t*	index,	/* in: index */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	upd_t*		update)	/* in: update vector */
+{
+	upd_field_t*	ufield;
+	ulint		field_no;
+	ulint		size_after;
+	ulint		n_upd;
+	ulint		k;
+
+	ut_ad(rec_offs_validate(NULL, index, offsets));
+
+	n_upd = upd_get_n_fields(update);
+
+	for (k = 0; k < n_upd; k++) {
+		ufield = upd_get_nth_field(update, k);
+
+		field_no = ufield->field_no;
+		size_after = ufield->new_val.len;
+
+		if (size_after == UNIV_SQL_NULL && !rec_offs_comp(offsets)) {
+			/* When rec_offs_comp() does not hold, the size to
+			compare is the SQL NULL size of the type of the
+			field being updated. The type must be that of
+			field number field_no in index: an earlier version
+			took the SQL NULL size from the wrong field. */
+
+			size_after = dtype_get_sql_null_size(
+					dict_index_get_nth_type(index,
+								field_no));
+		}
+
+		if (rec_offs_nth_size(offsets, field_no) != size_after) {
+			/* The stored size of the field would change */
+
+			return(TRUE);
+		}
+
+		if (rec_offs_nth_extern(offsets, field_no)
+		    || ufield->extern_storage) {
+			/* The field is stored externally in the record, or
+			its new value is flagged for external storage */
+
+			return(TRUE);
+		}
+	}
+
+	/* No updated field changes size or is external */
+
+	return(FALSE);
+}
+
+/***************************************************************
+Copies the new column values of the update vector over the corresponding
+fields of the given record. No field size changes are allowed. This function
+is used only for a clustered index */
+
+void
+row_upd_rec_in_place(
+/*=================*/
+	rec_t*		rec,	/* in/out: record where replaced */
+	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
+	upd_t*		update)	/* in: update vector */
+{
+	upd_field_t*	ufield;
+	ulint		n_upd;
+	ulint		k;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	/* The info bits of the record also come from the update vector */
+	rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits);
+
+	n_upd = upd_get_n_fields(update);
+
+	for (k = 0; k < n_upd; k++) {
+		ufield = upd_get_nth_field(update, k);
+
+		/* Overwrite field number field_no of rec with the new value */
+		rec_set_nth_field(rec, offsets, ufield->field_no,
+					dfield_get_data(&(ufield->new_val)),
+					dfield_get_len(&(ufield->new_val)));
+	}
+}
+
+/*************************************************************************
+Writes to a redo log buffer the position of the trx id column in a clustered
+index, the roll ptr, and the id of trx, in this order. */
+
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+				/* out: new pointer to mlog */
+	dict_index_t*	index,	/* in: clustered index */
+	trx_t*		trx,	/* in: transaction; its id is logged */
+	dulint		roll_ptr,/* in: roll ptr of the undo log record */
+	byte*		log_ptr,/* in: pointer to a buffer of size > 20
+				opened in mlog */
+	mtr_t*		mtr __attribute__((unused))) /* in: mtr */
+{
+	ut_ad(index->type & DICT_CLUSTERED);
+	ut_ad(mtr);
+	/* Position of the trx id column in index, in compressed form */
+	log_ptr += mach_write_compressed(log_ptr,
+			dict_index_get_sys_col_pos(index, DATA_TRX_ID));
+	/* The roll ptr takes a fixed DATA_ROLL_PTR_LEN bytes */
+	trx_write_roll_ptr(log_ptr, roll_ptr);
+	log_ptr += DATA_ROLL_PTR_LEN;	
+	/* The id of the transaction, as a compressed dulint */
+	log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+	return(log_ptr);
+}
+
+/*************************************************************************
+Parses the system field values written by row_upd_write_sys_vals_to_log. */
+
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+			/* out: log data end or NULL */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr,/* in: buffer end */
+	ulint*	pos,	/* out: TRX_ID position in record */
+	dulint*	trx_id,	/* out: trx id */
+	dulint*	roll_ptr)/* out: roll ptr */
+{
+	/* First comes the position of the trx id column, compressed */
+	ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+	if (ptr == NULL || end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+		/* Parsing the position failed, or fewer than
+		DATA_ROLL_PTR_LEN bytes remain for the roll ptr */
+
+		return(NULL);
+	}
+
+	/* The roll ptr takes DATA_ROLL_PTR_LEN bytes */
+	*roll_ptr = trx_read_roll_ptr(ptr);
+	ptr += DATA_ROLL_PTR_LEN;
+
+	/* The trx id comes last, as a compressed dulint; the result of
+	parsing it is returned as such */
+
+	return(mach_dulint_parse_compressed(ptr, end_ptr, trx_id));
+}
+
+/***************************************************************
+Writes to the redo log the info bits, field count and new values of update. */
+
+void
+row_upd_index_write_log(
+/*====================*/
+	upd_t*	update,	/* in: update vector */
+	byte*	log_ptr,/* in: pointer to mlog buffer: must contain at least
+			MLOG_BUF_MARGIN bytes of free space; the buffer is
+			closed within this function */
+	mtr_t*	mtr)	/* in: mtr into whose log to write */
+{
+	upd_field_t*	upd_field;
+	dfield_t*	new_val;
+	ulint		len;
+	ulint		n_fields;
+	byte*		buf_end;
+	ulint		i;
+
+	n_fields = upd_get_n_fields(update);
+
+	buf_end = log_ptr + MLOG_BUF_MARGIN;
+	/* The record begins with the info bits and the number of fields */
+	mach_write_to_1(log_ptr, update->info_bits);
+	log_ptr++;
+	log_ptr += mach_write_compressed(log_ptr, n_fields);
+	
+	for (i = 0; i < n_fields; i++) {
+
+		ut_ad(MLOG_BUF_MARGIN > 30);
+		/* Switch to a new buffer if less than 30 bytes remain */
+		if (log_ptr + 30 > buf_end) {
+			mlog_close(mtr, log_ptr);
+			
+			log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+			buf_end = log_ptr + MLOG_BUF_MARGIN;
+		}
+
+		upd_field = upd_get_nth_field(update, i);
+
+		new_val = &(upd_field->new_val);
+
+		len = new_val->len;
+		/* A field is logged as field number, length and data */
+		log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+		log_ptr += mach_write_compressed(log_ptr, len);
+		/* No data bytes are logged when len is UNIV_SQL_NULL */
+		if (len != UNIV_SQL_NULL) {
+			if (log_ptr + len < buf_end) {
+				ut_memcpy(log_ptr, new_val->data, len);
+
+				log_ptr += len;
+			} else {
+				mlog_close(mtr, log_ptr);
+				/* The data does not fit in the buffer */
+				mlog_catenate_string(mtr, new_val->data, len);
+
+				log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+				buf_end = log_ptr + MLOG_BUF_MARGIN;
+			}
+		}
+	}
+
+	mlog_close(mtr, log_ptr);
+}
+
+/*************************************************************************
+Parses log data written by row_upd_index_write_log into an update vector. */
+
+byte*
+row_upd_index_parse(
+/*================*/
+				/* out: log data end or NULL */
+	byte*		ptr,	/* in: buffer */
+	byte*		end_ptr,/* in: buffer end */
+	mem_heap_t*	heap,	/* in: memory heap where update vector is
+				built */
+	upd_t**		update_out)/* out: update vector, set on success */
+{
+	upd_t*		update;
+	upd_field_t*	upd_field;
+	dfield_t*	new_val;
+	ulint		len;
+	ulint		n_fields;
+	byte*		buf;
+	ulint		info_bits;
+	ulint		i;
+
+	if (end_ptr < ptr + 1) {
+		/* Not even the info bits byte is in the buffer */
+		return(NULL);
+	}
+
+	info_bits = mach_read_from_1(ptr);
+	ptr++;
+	ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+
+	update = upd_create(n_fields, heap);
+	update->info_bits = info_bits;
+	/* Each field was logged as field number, length and data */
+	for (i = 0; i < n_fields; i++) {
+		upd_field = upd_get_nth_field(update, i);
+		new_val = &(upd_field->new_val);
+
+		ptr = mach_parse_compressed(ptr, end_ptr,
+						&(upd_field->field_no));
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		new_val->len = len;
+		/* No data bytes were logged if len is UNIV_SQL_NULL */
+		if (len != UNIV_SQL_NULL) {
+
+			if (end_ptr < ptr + len) {
+				/* The data is not all in the buffer */
+				return(NULL);
+			} else {
+				buf = mem_heap_alloc(heap, len);
+				ut_memcpy(buf, ptr, len);
+
+				ptr += len;
+
+				new_val->data = buf;
+			}
+		}
+	}
+	/* *update_out is assigned only when everything was parsed */
+	*update_out = update;
+
+	return(ptr);
+}
+
+/*******************************************************************
+Searches ext_vec for the number i. */
+static
+ibool
+upd_ext_vec_contains(
+/*=================*/
+				/* out: TRUE if i is in ext_vec */
+	ulint*	ext_vec,	/* in: array of indexes or NULL */
+	ulint	n_ext_vec,	/* in: number of numbers in ext_vec */
+	ulint	i)		/* in: a number */
+{
+	const ulint*	cur;
+	const ulint*	end;
+
+	/* A NULL vector contains nothing */
+	if (ext_vec != NULL) {
+		end = ext_vec + n_ext_vec;
+
+		for (cur = ext_vec; cur != end; cur++) {
+			if (*cur == i) {
+				return(TRUE);
+			}
+		}
+	}
+
+	return(FALSE);
+}
+	
+/*******************************************************************
+Builds an update vector from those fields of a secondary index entry which
+differ from a record that has equal ordering fields. NOTE: we compare
+the fields as binary strings! */
+
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+				/* out, own: update vector of differing
+				fields */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: entry to insert */
+	rec_t*		rec,	/* in: secondary index record */
+	trx_t*		trx,	/* in: transaction */
+	mem_heap_t*	heap)	/* in: memory heap from which allocated */
+{
+	upd_field_t*	upd_field;
+	dfield_t*	dfield;
+	byte*		data;
+	ulint		len;
+	upd_t*		update;
+	ulint		n_diff;
+	ulint		i;
+	ulint		offsets_[REC_OFFS_SMALL_SIZE];
+	const ulint*	offsets;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+	/* The first element of offsets_ now holds the size of the array */
+	/* This function is used only for a secondary index */
+	ut_a(0 == (index->type & DICT_CLUSTERED));
+	/* Reserve room for the case that every field of entry differs */
+	update = upd_create(dtuple_get_n_fields(entry), heap);
+
+	n_diff = 0;
+	offsets = rec_get_offsets(rec, index, offsets_,
+						ULINT_UNDEFINED, &heap);
+
+	for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+		data = rec_get_nth_field(rec, offsets, i, &len);
+
+		dfield = dtuple_get_nth_field(entry, i);
+
+		/* NOTE that it may be that len != dfield_get_len(dfield) if we
+		are updating in a character set and collation where strings of
+		different length can be equal in an alphabetical comparison,
+		and also in the case where we have a column prefix index
+		and the last characters in the index field are spaces; the
+		latter case probably caused the assertion failures reported at
+		row0upd.c line 713 in versions 4.0.14 - 4.0.16. */
+
+		/* NOTE: we compare the fields as binary strings!
+		(No collation) */
+
+		if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+			upd_field = upd_get_nth_field(update, n_diff);
+			/* The new value is the one in entry */
+			dfield_copy(&(upd_field->new_val), dfield);
+
+			upd_field_set_field_no(upd_field, i, index, trx);
+
+			upd_field->extern_storage = FALSE;
+
+			n_diff++;
+		}
+	}
+	/* Only the first n_diff fields of update were filled in */
+	update->n_fields = n_diff;
+
+	return(update);
+}
+
+/*******************************************************************
+Builds an update vector from those fields, excluding the roll ptr and trx id
+fields, which in an index entry differ, in value or in external storage, from
+a record with equal ordering fields. NOTE: fields are compared as binary! */
+
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+				/* out, own: update vector of differing
+				fields, excluding roll ptr and trx id */
+	dict_index_t*	index,	/* in: clustered index */
+	dtuple_t*	entry,	/* in: entry to insert */
+	ulint*		ext_vec,/* in: array containing field numbers of
+				externally stored fields in entry, or NULL */
+	ulint		n_ext_vec,/* in: number of fields in ext_vec */
+	rec_t*		rec,	/* in: clustered index record */
+	trx_t*		trx,	/* in: transaction */
+	mem_heap_t*	heap)	/* in: memory heap from which allocated */
+{
+	upd_field_t*	upd_field;
+	dfield_t*	dfield;
+	byte*		data;
+	ulint		len;
+	upd_t*		update;
+	ulint		n_diff;
+	ulint		roll_ptr_pos;
+	ulint		trx_id_pos;
+	ibool		extern_bit;
+	ulint		i;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+	/* The first element of offsets_ now holds the size of the array */
+	/* This function is used only for a clustered index */
+	ut_a(index->type & DICT_CLUSTERED);
+	/* Reserve room for the case that every field of entry differs */
+	update = upd_create(dtuple_get_n_fields(entry), heap);
+
+	n_diff = 0;
+	/* The trx id and roll ptr columns are never put in the vector */
+	roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+	trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				ULINT_UNDEFINED, &heap);
+
+	for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+		data = rec_get_nth_field(rec, offsets, i, &len);
+
+		dfield = dtuple_get_nth_field(entry, i);
+
+		/* NOTE: we compare the fields as binary strings!
+		(No collation) */
+
+		if (i == trx_id_pos || i == roll_ptr_pos) {
+
+			goto skip_compare;
+		}
+
+		extern_bit = rec_offs_nth_extern(offsets, i);
+		/* A changed external storage status counts as a difference */
+		if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i)
+		    || !dfield_data_is_binary_equal(dfield, len, data)) {
+
+			upd_field = upd_get_nth_field(update, n_diff);
+
+			dfield_copy(&(upd_field->new_val), dfield);
+
+			upd_field_set_field_no(upd_field, i, index, trx);
+
+			if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) {
+				upd_field->extern_storage = TRUE;
+			} else {
+				upd_field->extern_storage = FALSE;
+			}
+				
+			n_diff++;
+		}
+skip_compare:
+		;
+	}
+	/* Only the first n_diff fields of update were filled in */
+	update->n_fields = n_diff;
+
+	return(update);
+}
+
+/***************************************************************
+Puts the new column values of an update vector, whose field numbers are index
+positions, into the index entry given. */
+
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+	dtuple_t*	entry,	/* in/out: index entry where replaced */
+	dict_index_t*	index,	/* in: index; NOTE that this may also be a
+				non-clustered index */
+	upd_t*		update,	/* in: an update vector built for the index so
+				that the field number in an upd_field is the
+				index position */
+	mem_heap_t*	heap)	/* in: memory heap to which we allocate and
+				copy the new values, set this as NULL if you
+				do not want allocation */
+{
+	dict_field_t*	field;
+	upd_field_t*	upd_field;
+	dfield_t*	dfield;
+	dfield_t*	new_val;
+	ulint		j;
+	ulint		i;
+	dtype_t*	cur_type;
+
+	ut_ad(index);
+	/* The info bits of the entry also come from the update vector */
+	dtuple_set_info_bits(entry, update->info_bits);
+
+	for (j = 0; j < dict_index_get_n_fields(index); j++) {
+
+	        field = dict_index_get_nth_field(index, j);
+
+		for (i = 0; i < upd_get_n_fields(update); i++) {
+
+		        upd_field = upd_get_nth_field(update, i);
+
+			if (upd_field->field_no == j) {
+
+			        dfield = dtuple_get_nth_field(entry, j);
+
+				new_val = &(upd_field->new_val);
+				/* Copy the data only if a heap is given */
+				dfield_set_data(dfield, new_val->data,
+								new_val->len);
+				if (heap && new_val->len != UNIV_SQL_NULL) {
+				        dfield->data = mem_heap_alloc(heap,
+								new_val->len);
+					ut_memcpy(dfield->data, new_val->data,
+								new_val->len);
+				}
+				/* Limit the length of a column prefix field */
+				if (field->prefix_len > 0
+			            && new_val->len != UNIV_SQL_NULL) {
+
+				  	cur_type = dict_col_get_type(
+						dict_field_get_col(field));
+
+				  	dfield->len = 
+				    		dtype_get_at_most_n_mbchars(
+				      			cur_type,
+							field->prefix_len,
+							new_val->len,
+							new_val->data);
+				}
+			}
+		}
+	}
+}
+
+/***************************************************************
+Puts the new column values of an update vector, whose field numbers are
+clustered index positions, into the index entry given. */
+
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+	dtuple_t*	entry,	/* in/out: index entry where replaced */
+	dict_index_t*	index,	/* in: index; NOTE that this may also be a
+				non-clustered index */
+	upd_t*		update,	/* in: an update vector built for the
+				CLUSTERED index so that the field number in
+				an upd_field is the clustered index position */
+	mem_heap_t*	heap)	/* in: memory heap to which we allocate and
+				copy the new values, set this as NULL if you
+				do not want allocation */
+{
+	dict_field_t*	field;
+	upd_field_t*	upd_field;
+	dfield_t*	dfield;
+	dfield_t*	new_val;
+	ulint		j;
+	ulint		i;
+	dtype_t*	cur_type;
+
+	ut_ad(index);
+	/* The info bits of the entry also come from the update vector */
+	dtuple_set_info_bits(entry, update->info_bits);
+
+	for (j = 0; j < dict_index_get_n_fields(index); j++) {
+
+	        field = dict_index_get_nth_field(index, j);
+
+		for (i = 0; i < upd_get_n_fields(update); i++) {
+
+		        upd_field = upd_get_nth_field(update, i);
+			/* Match on the clustered index position of the column */
+			if (upd_field->field_no == field->col->clust_pos) {
+
+			        dfield = dtuple_get_nth_field(entry, j);
+
+				new_val = &(upd_field->new_val);
+				/* Copy the data only if a heap is given */
+				dfield_set_data(dfield, new_val->data,
+								new_val->len);
+				if (heap && new_val->len != UNIV_SQL_NULL) {
+				        dfield->data = mem_heap_alloc(heap,
+								new_val->len);
+					ut_memcpy(dfield->data, new_val->data,
+								new_val->len);
+				}
+				/* Limit the length of a column prefix field */
+				if (field->prefix_len > 0
+			            && new_val->len != UNIV_SQL_NULL) {
+
+					cur_type = dict_col_get_type(
+						dict_field_get_col(field));
+
+				  	dfield->len =
+				    		dtype_get_at_most_n_mbchars(
+				      			cur_type,
+							field->prefix_len,
+							new_val->len,
+							new_val->data);
+				}
+			}
+		}
+	}
+}
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings! */
+
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+				/* out: TRUE if update vector changes
+				an ordering field in the index record;
+				NOTE: the fields are compared as binary
+				strings */
+	dtuple_t*	row,	/* in: old value of row, or NULL if the
+				row and the data values in update are not
+				known yet, e.g., at compile time; then any
+				update of an ordering column counts */
+	dict_index_t*	index,	/* in: index of the record */
+	upd_t*		update)	/* in: update vector for the row; NOTE: the
+				field numbers in this MUST be clustered index
+				positions! */
+{
+	upd_field_t*	upd_field;
+	dict_field_t*	ind_field;
+	dict_col_t*	col;
+	ulint		n_unique;
+	ulint		n_upd_fields;
+	ulint		col_pos;
+	ulint		col_no;
+	ulint		i, j;
+	
+	ut_ad(update && index);
+
+	n_unique = dict_index_get_n_unique(index);
+	n_upd_fields = upd_get_n_fields(update);
+	/* Only the first n_unique fields of the index are examined */
+	for (i = 0; i < n_unique; i++) {
+		/* col_pos is matched to update, col_no is a position in row */
+		ind_field = dict_index_get_nth_field(index, i);
+		col = dict_field_get_col(ind_field);
+		col_pos = dict_col_get_clust_pos(col);
+		col_no = dict_col_get_no(col);
+
+		for (j = 0; j < n_upd_fields; j++) {
+
+			upd_field = upd_get_nth_field(update, j);
+
+			/* Note that if the index field is a column prefix
+			then it may be that row does not contain an externally
+			stored part of the column value; the data cannot be
+			compared then, and a change is reported */
+
+			if (col_pos == upd_field->field_no
+			    && (row == NULL
+			        || ind_field->prefix_len > 0
+				|| !dfield_datas_are_binary_equal(
+					dtuple_get_nth_field(row, col_no),
+						&(upd_field->new_val)))) {
+				return(TRUE);
+			}
+		}
+	}
+
+	return(FALSE);
+}
+
+/***************************************************************
+Checks if an update vector touches a column whose ord_part flag is set, in
+which case an ordering field of some index record may change. */
+
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+				/* out: TRUE if update vector may change
+				an ordering field in an index record */
+	dict_table_t*	table,	/* in: table */
+	upd_t*		update)	/* in: update vector for the row */
+{
+	dict_index_t*	clust_index;
+	dict_col_t*	col;
+	ulint		field_no;
+	ulint		k;
+
+	clust_index = dict_table_get_first_index(table);
+
+	for (k = 0; k < upd_get_n_fields(update); k++) {
+		field_no = upd_get_nth_field(update, k)->field_no;
+		col = dict_field_get_col(
+			dict_index_get_nth_field(clust_index, field_no));
+
+		if (col->ord_part) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/***************************************************************
+Checks if an update vector gives a binary-different value to any of the n
+first fields of an index entry. This is only used in foreign key checks and
+we can assume that index does not contain column prefixes. */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+				/* out: TRUE if changes */
+	dtuple_t*	entry,	/* in: index entry */
+	dict_index_t*	index,	/* in: index of entry */
+	upd_t*		update,	/* in: update vector for the row */
+	ulint		n)	/* in: how many first fields to check */
+{
+	dict_field_t*	ifield;
+	upd_field_t*	ufield;
+	ulint		clust_pos;
+	ulint		n_upd;
+	ulint		f;
+	ulint		u;
+
+	ut_a(update && index);
+	ut_a(n <= dict_index_get_n_fields(index));
+
+	n_upd = upd_get_n_fields(update);
+
+	for (f = 0; f < n; f++) {
+		ifield = dict_index_get_nth_field(index, f);
+		clust_pos = dict_col_get_clust_pos(dict_field_get_col(ifield));
+
+		ut_a(ifield->prefix_len == 0);
+
+		for (u = 0; u < n_upd; u++) {
+			ufield = upd_get_nth_field(update, u);
+
+			if (ufield->field_no != clust_pos) {
+				continue;
+			}
+
+			if (!dfield_datas_are_binary_equal(
+					dtuple_get_nth_field(entry, f),
+					&(ufield->new_val))) {
+				return(TRUE);
+			}
+		}
+	}
+
+	return(FALSE);
+}
+
+/*************************************************************************
+Copies from a record the value of every column in a column list. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+	rec_t*		rec,	/* in: record in a clustered index */
+	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
+	sym_node_t*	column)	/* in: first column in a column list, or
+				NULL */
+{
+	ulint	clust_field_no;
+	ulint	len;
+	byte*	val;
+
+	for (; column != NULL;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+		clust_field_no = column->field_nos[SYM_CLUST_FIELD_NO];
+		val = rec_get_nth_field(rec, offsets, clust_field_no, &len);
+		eval_node_copy_and_alloc_val(column, val, len);
+	}
+}
+
+/*************************************************************************
+Evaluates the expression of each field in the update vector and stores the
+result as the new value of the field. Note that row_upd_copy_columns must
+have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+	upd_t*	update)	/* in/out: update vector */
+{
+	upd_field_t*	ufield;
+	ulint		n_upd;
+	ulint		k;
+
+	n_upd = upd_get_n_fields(update);
+
+	for (k = 0; k < n_upd; k++) {
+		ufield = upd_get_nth_field(update, k);
+
+		/* Evaluate first, then copy the value of the expression */
+		eval_exp(ufield->exp);
+
+		dfield_copy_data(&(ufield->new_val),
+					que_node_get_val(ufield->exp));
+	}
+}
+
+/***************************************************************
+Stores to node->heap the row at node->pcur and builds node->ext_vec. */
+static
+void
+row_upd_store_row(
+/*==============*/
+	upd_node_t*	node)	/* in: row update node */
+{
+	dict_index_t*	clust_index;
+	upd_t*		update;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+	/* The first element of offsets_ now holds the size of the array */
+	ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+	/* Drop a previously stored row, if any, by emptying node->heap */
+	if (node->row != NULL) {
+		mem_heap_empty(node->heap);
+		node->row = NULL;
+	}
+	
+	clust_index = dict_table_get_first_index(node->table);
+
+	rec = btr_pcur_get_rec(node->pcur);
+	
+	offsets = rec_get_offsets(rec, clust_index, offsets_,
+						ULINT_UNDEFINED, &heap);
+	node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+								node->heap);
+	node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint)
+					* rec_offs_n_fields(offsets));
+	if (node->is_delete) {
+		update = NULL;
+	} else {
+		update = node->update;
+	}
+	/* For a delete, no update vector is passed to the call below */
+	node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec,
+						offsets, update);
+	if (heap) {
+		mem_heap_free(heap);
+	}
+}
+
+/***************************************************************
+Delete-marks the old secondary index entry of a row and inserts a new one. */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ibool		check_ref;
+	ibool		found;
+	dict_index_t*	index;
+	dtuple_t*	entry;
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	mem_heap_t*	heap;
+	rec_t*		rec;
+	ulint		err	= DB_SUCCESS;
+	mtr_t		mtr;
+	trx_t*		trx	= thr_get_trx(thr);
+
+	index = node->index;
+	
+	check_ref = row_upd_index_is_referenced(index, trx);
+
+	heap = mem_heap_create(1024);
+
+	/* Build old index entry */
+	entry = row_build_index_entry(node->row, index, heap);
+	/* The search and the delete marking share one mini-transaction */
+	log_free_check();
+	mtr_start(&mtr);
+	
+	found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+									&mtr);
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	rec = btr_cur_get_rec(btr_cur);
+	/* A missing entry is only reported to stderr; err is not set */
+	if (!found) {
+		fputs("InnoDB: error in sec index entry update in\n"
+			"InnoDB: ", stderr);
+		dict_index_name_print(stderr, trx, index);
+		fputs("\n"
+			"InnoDB: tuple ", stderr);
+		dtuple_print(stderr, entry);
+		fputs("\n"
+			"InnoDB: record ", stderr);
+		rec_print(stderr, rec, index);
+		putc('\n', stderr);
+
+		trx_print(stderr, trx);
+
+		fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+	} else {
+		/* Delete mark the old index record; it can already be
+		delete marked if we return here after a lock wait in
+		row_ins_index_entry below */
+
+		if (!rec_get_deleted_flag(rec, index->table->comp)) {
+			err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
+								thr, &mtr);
+			if (err == DB_SUCCESS && check_ref) {
+			    	
+				/* NOTE that the following call loses
+				the position of pcur ! */
+				err = row_upd_check_references_constraints(
+							node,
+							&pcur, index->table,
+							index, thr, &mtr);
+				if (err != DB_SUCCESS) {
+
+					goto close_cur;
+				}
+			}
+
+	  	}
+	}
+close_cur:
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	/* A delete, or a failure above, ends here without an insert */
+	if (node->is_delete || err != DB_SUCCESS) {
+
+		mem_heap_free(heap);	
+
+        	return(err);
+	}
+
+	/* Build a new index entry */
+	row_upd_index_replace_new_col_vals(entry, index, node->update, NULL);
+
+	/* Insert new index entry */
+	err = row_ins_index_entry(index, entry, NULL, 0, thr);
+
+	mem_heap_free(heap);	
+
+        return(err);
+}
+
+/***************************************************************
+Updates the entry of the row in the secondary index node->index, if the node
+state asks for all secondary indexes to be updated or if the update changes
+an ordering field of that index. */
+UNIV_INLINE
+ulint
+row_upd_sec_step(
+/*=============*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+				|| (node->state == UPD_NODE_UPDATE_SOME_SEC));
+	ut_ad(!(node->index->type & DICT_CLUSTERED));
+
+	if (node->state != UPD_NODE_UPDATE_ALL_SEC
+	    && !row_upd_changes_ord_field_binary(node->row, node->index,
+							    node->update)) {
+		/* Not every secondary index is to be updated, and no
+		ordering field of this one changes */
+
+		return(DB_SUCCESS);
+	}
+
+	return(row_upd_sec_index_entry(node, thr));
+}
+
+/***************************************************************
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications. */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	dict_index_t*	index,	/* in: clustered index of the record */
+	que_thr_t*	thr,	/* in: query thread */
+	ibool		check_ref,/* in: TRUE if index may be referenced in
+				a foreign key constraint */
+	mtr_t*		mtr)	/* in: mtr; gets committed here */
+{
+	mem_heap_t*	heap	= NULL;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	trx_t*		trx;
+	dict_table_t*	table;
+	dtuple_t*	entry;
+	ulint		err;
+	
+	ut_ad(node);
+	ut_ad(index->type & DICT_CLUSTERED);
+
+	trx = thr_get_trx(thr);
+	table = node->table;
+	pcur = node->pcur;
+	btr_cur	= btr_pcur_get_btr_cur(pcur);
+	/* Skip the delete marking if an earlier call already got past it */
+	if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+		ulint	offsets_[REC_OFFS_NORMAL_SIZE];
+		*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+		err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+						btr_cur, TRUE, thr, mtr);
+		if (err != DB_SUCCESS) {
+			mtr_commit(mtr);
+			return(err);
+		}
+
+		/* Mark as not-owned the externally stored fields which the new
+		row inherits from the delete marked record: purge should not
+		free those externally stored fields even if the delete marked
+		record is removed from the index tree, or updated. */
+
+		btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur),
+				rec_get_offsets(btr_cur_get_rec(btr_cur),
+				dict_table_get_first_index(table), offsets_,
+				ULINT_UNDEFINED, &heap), node->update, mtr);
+		if (check_ref) {
+			/* NOTE that the following call loses
+			the position of pcur ! */
+			err = row_upd_check_references_constraints(node,
+							pcur, table,
+							index, thr, mtr);
+			if (err != DB_SUCCESS) {
+				mtr_commit(mtr);
+				if (heap) {
+					mem_heap_free(heap);
+				}
+				return(err);
+			}
+		}
+
+	} 
+
+	mtr_commit(mtr);
+	/* heap is non-NULL only if rec_get_offsets() above set it */
+	if (!heap) {
+		heap = mem_heap_create(500);
+	}
+	node->state = UPD_NODE_INSERT_CLUSTERED;
+
+	entry = row_build_index_entry(node->row, index, heap);
+
+	row_upd_index_replace_new_col_vals(entry, index, node->update, NULL);
+	
+	row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+	
+	/* If we return from a lock wait, for example, we may have
+	extern fields marked as not-owned in entry (marked in the
+	if-branch above). We must unmark them. */
+	
+	btr_cur_unmark_dtuple_extern_fields(entry, node->ext_vec,
+							node->n_ext_vec);
+	/* We must mark non-updated extern fields in entry as inherited,
+	so that a possible rollback will not free them */
+	
+	btr_cur_mark_dtuple_inherited_extern(entry, node->ext_vec,
+						node->n_ext_vec,
+						node->update);
+	
+	err = row_ins_index_entry(index, entry, node->ext_vec,
+						node->n_ext_vec, thr);
+	mem_heap_free(heap);
+	
+	return(err);
+}
+
+/***************************************************************
+Updates a clustered index record of a row when the ordering fields do
+not change. An update which keeps the record on its page is tried first:
+an in-place update if UPD_NODE_NO_SIZE_CHANGE is set in node->cmpl_info,
+otherwise an optimistic update. If that does not return DB_SUCCESS, mtr is
+started again, the cursor is restored in BTR_MODIFY_TREE mode and a
+pessimistic update is performed. If the pessimistic update succeeds and
+hands back a big_rec, btr_store_big_rec_extern_fields() is called for it in
+one more mini-transaction. The mtr is committed on every return path. */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	dict_index_t*	index,	/* in: clustered index */
+	que_thr_t*	thr,	/* in: query thread */
+	mtr_t*		mtr)	/* in: mtr; gets committed here */
+{
+	big_rec_t*	big_rec	= NULL;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	ulint		err;
+	
+	ut_ad(node);
+	ut_ad(index->type & DICT_CLUSTERED);
+
+	pcur = node->pcur;
+	btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+					index->table->comp));
+	
+	/* Try optimistic updating of the record, keeping changes within
+	the page; we do not check locks because we assume the x-lock on the
+	record to update */
+
+	if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+		err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+						btr_cur, node->update,
+						node->cmpl_info, thr, mtr);
+	} else {
+		err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+						btr_cur, node->update,
+						node->cmpl_info, thr, mtr);
+	}
+
+	mtr_commit(mtr);
+	
+	if (err == DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* We may have to modify the tree structure: do a pessimistic descent
+	down the index tree. The mtr was committed above, so it is started
+	anew and the cursor position has to be restored. */
+
+	mtr_start(mtr);
+	
+	/* NOTE: this transaction has an s-lock or x-lock on the record and
+	therefore other transactions cannot modify the record when we have no
+	latch on the page. In addition, we assume that other query threads of
+	the same transaction do not modify the record in the meantime.
+	Therefore we can assert that the restoration of the cursor succeeds. */
+
+	ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+	ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+					index->table->comp));
+	
+	err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+					&big_rec, node->update,
+					node->cmpl_info, thr, mtr);
+	mtr_commit(mtr);
+
+	if (err == DB_SUCCESS && big_rec) { /* hand the contents of big_rec
+				to btr_store_big_rec_extern_fields() in a
+				mini-transaction of their own */
+		mem_heap_t*	heap		= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		rec_t*		rec;
+		*offsets_ = (sizeof offsets_) / sizeof *offsets_; /* the
+				first element carries the number of elements
+				of offsets_[] */
+
+		mtr_start(mtr);
+
+		ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+		rec = btr_cur_get_rec(btr_cur);
+		err = btr_store_big_rec_extern_fields(index, rec,
+			rec_get_offsets(rec, index, offsets_,
+				ULINT_UNDEFINED, &heap),
+			big_rec, mtr);
+		if (heap) {
+			mem_heap_free(heap);
+		}
+		mtr_commit(mtr);
+	}
+
+	if (big_rec) { /* freed whether or not the update succeeded */
+		dtuple_big_rec_free(big_rec);
+	}
+		
+	return(err);
+}
+
+/***************************************************************
+Sets the delete mark on the clustered index record the persistent cursor
+of node is positioned on. The row is stored in node beforehand, and if
+check_ref is TRUE and the marking succeeded, the foreign key constraints
+referencing the index are checked. */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code */
+	upd_node_t*	node,	/* in: row update node */
+	dict_index_t*	index,	/* in: clustered index */
+	que_thr_t*	thr,	/* in: query thread */
+	ibool		check_ref,/* in: TRUE if index may be referenced in
+				a foreign key constraint */
+	mtr_t*		mtr)	/* in: mtr; gets committed here */
+{
+	btr_pcur_t*	pers_cursor;
+	btr_cur_t*	tree_cursor;
+	ulint		err;
+
+	ut_ad(node);
+	ut_ad(index->type & DICT_CLUSTERED);
+	ut_ad(node->is_delete);
+
+	pers_cursor = node->pcur;
+	tree_cursor = btr_pcur_get_btr_cur(pers_cursor);
+
+	/* The secondary index entries are built from the stored row
+	later on, so save it before touching the record */
+
+	row_upd_store_row(node);
+
+	/* No lock check is made here: the record is assumed to be
+	x-locked by this transaction already */
+
+	err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+					tree_cursor, TRUE, thr, mtr);
+
+	if (check_ref && err == DB_SUCCESS) {
+		/* NOTE: the position of the persistent cursor is lost in
+		the following call */
+
+		err = row_upd_check_references_constraints(node,
+						pers_cursor, index->table,
+						index, thr, mtr);
+	}
+
+	/* The mtr is committed regardless of the outcome */
+
+	mtr_commit(mtr);
+
+	return(err);
+}
+
+/***************************************************************
+Updates the clustered index record. The persistent cursor of node is
+first restored in BTR_MODIFY_LEAF mode; unless node->has_clust_rec_x_lock
+is set, lock_clust_rec_modify_check_and_lock() is then called on the
+record. After that the record is delete marked (node->is_delete), updated
+with row_upd_clust_rec(), or updated with row_upd_clust_rec_by_insert()
+when an ordering field of the clustered index changes.
+On a successful delete mark, and on a successful update made when
+UPD_NODE_NO_ORD_CHANGE is not set in node->cmpl_info, node->state is set to
+UPD_NODE_UPDATE_ALL_SEC or UPD_NODE_UPDATE_SOME_SEC and node->index to the
+index following the clustered index. When UPD_NODE_NO_ORD_CHANGE is set,
+the result of row_upd_clust_rec() is returned without touching node->state
+or node->index. */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, DB_LOCK_WAIT in case of a lock wait,
+				else error code */
+	upd_node_t*	node,	/* in: row update node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_index_t*	index;
+	btr_pcur_t*	pcur;
+	ibool		success;
+	ibool		check_ref;
+	ulint		err;
+	mtr_t*		mtr;
+	mtr_t		mtr_buf;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_; /* the first
+				element carries the number of elements of
+				offsets_[] */
+
+	index = dict_table_get_first_index(node->table);
+
+	check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
+
+	pcur = node->pcur;
+
+	/* We have to restore the cursor to its position */
+	mtr = &mtr_buf;
+
+	mtr_start(mtr);
+	
+	/* If the restoration does not succeed, then the same
+	transaction has deleted the record on which the cursor was,
+	and that is an SQL error. If the restoration succeeds, it may
+	still be that the same transaction has successively deleted
+	and inserted a record with the same ordering fields, but in
+	that case we know that the transaction has at least an
+	implicit x-lock on the record. */
+	
+	ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+	if (!success) {
+		err = DB_RECORD_NOT_FOUND;
+
+		mtr_commit(mtr);
+
+		return(err);
+	}
+
+	/* If this is a row in SYS_INDEXES table of the data dictionary,
+	then we have to free the file segments of the index tree associated
+	with the index. The mtr is committed and started again after that,
+	so the cursor has to be restored once more; a failure there is
+	reported as DB_ERROR. */
+
+	if (node->is_delete
+	    && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+		dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+		mtr_commit(mtr);
+
+		mtr_start(mtr);
+
+		success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+									mtr);
+		if (!success) {
+			err = DB_ERROR;
+
+			mtr_commit(mtr);
+
+			return(err);
+		}
+	} 
+
+	rec = btr_pcur_get_rec(pcur);
+	offsets = rec_get_offsets(rec, index, offsets_,
+						ULINT_UNDEFINED, &heap);
+
+	if (!node->has_clust_rec_x_lock) {
+		err = lock_clust_rec_modify_check_and_lock(0,
+						rec, index, offsets, thr);
+		if (err != DB_SUCCESS) {
+			mtr_commit(mtr);
+			goto exit_func;
+		}
+	}
+
+	/* NOTE: the following function calls will also commit mtr */
+
+	if (node->is_delete) {
+		err = row_upd_del_mark_clust_rec(node, index, thr, check_ref,
+									mtr);
+		if (err == DB_SUCCESS) {
+			node->state = UPD_NODE_UPDATE_ALL_SEC;
+			node->index = dict_table_get_next_index(index);
+		}
+	exit_func: /* also the target of the goto taken when the lock
+		request above fails; frees heap and returns err */
+		if (heap) {
+			mem_heap_free(heap);
+		}
+		return(err);
+	}
+	
+	/* If the update is made for MySQL, we already have the update vector
+	ready, else we have to do some evaluation: */
+ 
+	if (!node->in_mysql_interface) {
+		/* Copy the necessary columns from clust_rec and calculate the
+		new values to set */
+		row_upd_copy_columns(rec, offsets,
+					UT_LIST_GET_FIRST(node->columns));
+		row_upd_eval_new_vals(node->update);
+	}
+
+	if (heap) { /* offsets is not used below this point */
+		mem_heap_free(heap);
+	}
+		
+	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+		err = row_upd_clust_rec(node, index, thr, mtr);
+		return(err);
+	}
+	
+	row_upd_store_row(node);
+
+	if (row_upd_changes_ord_field_binary(node->row, index, node->update)) {
+
+		/* Update causes an ordering field (ordering fields within
+		the B-tree) of the clustered index record to change: perform
+		the update by delete marking and inserting.
+
+		TODO! What to do to the 'Halloween problem', where an update
+		moves the record forward in index so that it is again
+		updated when the cursor arrives there? Solution: the
+		read operation must check the undo record undo number when
+		choosing records to update. MySQL solves now the problem
+		externally! */
+
+		err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
+									mtr);
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_ALL_SEC;
+	} else {
+		err = row_upd_clust_rec(node, index, thr, mtr);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_SOME_SEC;
+	}
+
+	node->index = dict_table_get_next_index(index);
+
+	return(err);
+}
+
+/***************************************************************
+Applies the update or delete described by node to the index records of
+one row. If node->state asks for it, the clustered index record is
+processed first with row_upd_clust_step(); after that the remaining
+indexes are walked with row_upd_sec_step(), unless this is an update with
+UPD_NODE_NO_ORD_CHANGE set. The caller is expected to have a persistent
+cursor which was on a record, with the position stored in the cursor. */
+static
+ulint
+row_upd(
+/*====*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ulint	err	= DB_SUCCESS;
+
+	ut_ad(node && thr);
+
+	if (node->in_mysql_interface) {
+		/* The MySQL interpreter does not supply cmpl_info, so it
+		is computed here: a delete, or an update which changes an
+		ordering field of some index, gets no flags at all */
+
+		node->cmpl_info = (node->is_delete
+			|| row_upd_changes_some_index_ord_field_binary(
+					node->table, node->update))
+			? 0 : UPD_NODE_NO_ORD_CHANGE;
+	}
+
+	if (node->state == UPD_NODE_UPDATE_CLUSTERED
+				|| node->state == UPD_NODE_INSERT_CLUSTERED) {
+
+		err = row_upd_clust_step(node, thr);
+
+		if (err != DB_SUCCESS) {
+			/* On an error the node is left as it is: no
+			cleanup is done */
+
+			return(err);
+		}
+	}
+
+	if (node->is_delete || !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+		/* Secondary indexes may be affected: process them one by
+		one starting from node->index */
+
+		for (; node->index != NULL;
+			node->index = dict_table_get_next_index(node->index)) {
+
+			err = row_upd_sec_step(node, thr);
+
+			if (err != DB_SUCCESS) {
+
+				return(err);
+			}
+		}
+	}
+
+	/* Every step succeeded (err is DB_SUCCESS here): forget the stored
+	row, empty the node heap and reset the state for the next row */
+
+	if (node->row != NULL) {
+		node->row = NULL;
+		node->n_ext_vec = 0;
+		mem_heap_empty(node->heap);
+	}
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	return(err);
+}
+
+/*************************************************************************
+Executes one step of an update node in an SQL execution graph: sets the
+IX lock on the table when the node is entered from its parent, passes
+control to the select node to fetch a row in a searched update, and
+updates the fetched row with row_upd(). The error code of the step is
+stored in trx->error_state whenever row_upd() was called or an error
+occurred. */
+
+que_thr_t*
+row_upd_step(
+/*=========*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	upd_node_t*	node;
+	sel_node_t*	sel_node;
+	que_node_t*	parent;
+	trx_t*		trx;
+	ulint		err;
+
+	ut_ad(thr);
+
+	trx = thr_get_trx(thr);
+
+	trx_start_if_not_started(trx);
+
+	node = thr->run_node;
+	sel_node = node->select;
+	parent = que_node_get_parent(node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+	if (thr->prev_node == parent) {
+		/* Control arrived from the parent node: begin from the
+		table lock */
+		node->state = UPD_NODE_SET_IX_LOCK;
+	}
+
+	if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+		if (!node->has_clust_rec_x_lock) {
+			/* The session may not have started its transaction
+			yet, or the transaction may have been committed */
+
+			err = lock_table(0, node->table, LOCK_IX, thr);
+
+			if (err != DB_SUCCESS) {
+				trx->error_state = err;
+
+				return(NULL);
+			}
+		}
+
+		node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+		if (node->searched_update) {
+			/* Reopen the cursor of the select and let it fetch
+			a row to update */
+
+			sel_node->state = SEL_NODE_OPEN;
+			thr->run_node = sel_node;
+
+			return(thr);
+		}
+	}
+
+	/* In the MySQL interface sel_node is NULL */
+
+	if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+		if (!node->searched_update) {
+			/* With an explicit cursor there should be a row to
+			update under the cursor */
+
+			ut_error;
+
+			trx->error_state = DB_ERROR;
+
+			return(NULL);
+		}
+
+		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* Either the rows ran out, or the select node already made
+		the updates in-place: return to the parent */
+
+		thr->run_node = parent;
+
+		return(thr);
+	}
+
+	/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+	err = row_upd(node, thr);
+
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		/* DB_LOCK_WAIT and real errors are treated alike here */
+
+		return(NULL);
+	}
+
+	/* DO THE TRIGGER ACTIONS HERE */
+
+	if (!node->searched_update) {
+		/* Explicit cursor update: go back to the parent */
+
+		thr->run_node = parent;
+	} else {
+		/* Searched update: fetch the next row to update */
+
+		thr->run_node = sel_node;
+	}
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	return(thr);
+}
+
+/*************************************************************************
+Updates in place the clustered index record on which the persistent
+cursor of the parent update node of a select is positioned. The columns
+needed are copied from the record, the new values are evaluated, and
+btr_cur_update_in_place() is called within the mtr of the caller. */
+
+void
+row_upd_in_place_in_select(
+/*=======================*/
+	sel_node_t*	sel_node,	/* in: select node */
+	que_thr_t*	thr,		/* in: query thread */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	upd_node_t*	node;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	rec_t*		rec;
+	const ulint*	offsets;
+	ulint		err;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+	ut_ad(sel_node->select_will_do_update);
+	ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF);
+	ut_ad(sel_node->asc);
+
+	node = que_node_get_parent(sel_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+	pcur = node->pcur;
+	btr_cur = btr_pcur_get_btr_cur(pcur);
+	rec = btr_pcur_get_rec(pcur);
+
+	/* Fetch the columns the update needs from the record; the offsets
+	are needed only for the duration of the copy */
+
+	offsets = rec_get_offsets(rec, btr_cur->index, offsets_,
+						ULINT_UNDEFINED, &heap);
+
+	row_upd_copy_columns(rec, offsets, UT_LIST_GET_FIRST(node->columns));
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+
+	/* Evaluate the new values to set */
+
+	row_upd_eval_new_vals(node->update);
+
+	ut_ad(!rec_get_deleted_flag(rec, btr_cur->index->table->comp));
+
+	ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE);
+	ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+	ut_ad(node->select_will_do_update);
+
+	err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur,
+						node->update, node->cmpl_info,
+						thr, mtr);
+	ut_ad(err == DB_SUCCESS);
+}
diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c
new file mode 100644
index 00000000000..36f6c27636d
--- /dev/null
+++ b/storage/innobase/row/row0vers.c
@@ -0,0 +1,492 @@
+/******************************************************
+Row versions
+
+(c) 1997 Innobase Oy
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "lock0lock.h"
+
+/*********************************************************************
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function! The clustered index record corresponding to rec is looked up,
+and its earlier versions are built one by one for as long as they carry
+the transaction id stored in the clustered index record. NULL is returned
+also when no clustered index record is found, when
+lock_check_trx_id_sanity() fails, or when the versions examined end
+without any of them requiring rec to be in a state different from its
+present one. The kernel mutex is held again on every return path. */
+
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+				/* out: NULL if committed, else the active
+				transaction; NOTE that the kernel mutex is
+				temporarily released! */
+	rec_t*		rec,	/* in: record in a secondary index */
+	dict_index_t*	index,	/* in: the secondary index */
+	const ulint*	offsets)/* in: rec_get_offsets(rec, index) */
+{
+	dict_index_t*	clust_index;
+	rec_t*		clust_rec;
+	ulint*		clust_offsets;
+	rec_t*		version;
+	rec_t*		prev_version;
+	dulint		trx_id;
+	dulint		prev_trx_id;
+	mem_heap_t*	heap;
+	mem_heap_t*	heap2;
+	dtuple_t*	row;
+	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
+					warning */
+	trx_t*		trx;
+	ibool		vers_del;
+	ibool		rec_del;
+	ulint		err;
+	mtr_t		mtr;
+	ibool		comp;
+	
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mutex_exit(&kernel_mutex);
+
+	mtr_start(&mtr);
+	
+	/* Search for the clustered index record: this is a time-consuming
+	operation: therefore we release the kernel mutex; also, the release
+	is required by the latching order convention. The latch on the
+	clustered index locks the top of the stack of versions. We also
+	reserve purge_latch to lock the bottom of the version stack. */	
+
+	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+							&clust_index, &mtr);
+	if (!clust_rec) {
+		/* In a rare case it is possible that no clust rec is found
+		for a secondary index record: if in row0umod.c
+		row_undo_mod_remove_clust_low() we have already removed the
+		clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case there cannot be
+		any implicit lock on the secondary index record, because
+		an active transaction which has modified the secondary index
+		record has also modified the clustered index record. And in
+		a rollback we always undo the modifications to secondary index
+		records before the clustered index record. */
+
+		mutex_enter(&kernel_mutex);
+		mtr_commit(&mtr);
+
+	        return(NULL);
+	}
+
+	heap = mem_heap_create(1024);
+	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+					ULINT_UNDEFINED, &heap);
+	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+
+	mtr_s_lock(&(purge_sys->latch), &mtr);
+
+	mutex_enter(&kernel_mutex);
+	
+	trx = NULL;
+	if (!trx_is_active(trx_id)) {
+		/* The transaction that modified or inserted clust_rec is no
+		longer active: no implicit lock on rec */
+		goto exit_func;
+	}
+
+	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
+					clust_offsets, TRUE)) {
+		/* Corruption noticed: try to avoid a crash by returning */
+		goto exit_func;
+	}
+
+	comp = index->table->comp;
+	ut_ad(index->table == clust_index->table);
+	ut_ad(comp == page_is_comp(buf_frame_align(rec)));
+	ut_ad(comp == page_is_comp(buf_frame_align(clust_rec)));
+
+	/* We look up if some earlier version, which was modified by the trx_id
+	transaction, of the clustered index record would require rec to be in
+	a different state (delete marked or unmarked, or have different field
+	values, or not existing). If there is such a version, then rec was
+	modified by the trx_id transaction, and it has an implicit x-lock on
+	rec. Note that if clust_rec itself would require rec to be in a
+	different state, then the trx_id transaction has not yet had time to
+	modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+	rec_del = rec_get_deleted_flag(rec, comp);
+	trx = NULL;
+
+	version = clust_rec;
+
+	for (;;) {
+		mutex_exit(&kernel_mutex);
+
+		/* While we retrieve an earlier version of clust_rec, we
+		release the kernel mutex, because it may take time to access
+		the disk. After the release, we have to check if the trx_id
+		transaction is still active. We keep the semaphore in mtr on
+		the clust_rec page, so that no other transaction can update
+		it and get an implicit x-lock on rec.
+
+		The previous version is built into a newly created heap; the
+		heap of the current round (heap2) is freed only after the
+		call, because the call still reads data stored in it. */
+
+		heap2 = heap;
+		heap = mem_heap_create(1024);
+		err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+					clust_index, clust_offsets, heap,
+					&prev_version);
+		mem_heap_free(heap2); /* free version and clust_offsets */
+
+		if (prev_version) {
+			clust_offsets = rec_get_offsets(prev_version,
+					clust_index, NULL,
+					ULINT_UNDEFINED, &heap);
+			row = row_build(ROW_COPY_POINTERS, clust_index,
+					prev_version, clust_offsets, heap);
+			entry = row_build_index_entry(row, index, heap);
+		}
+
+		mutex_enter(&kernel_mutex);
+
+		if (!trx_is_active(trx_id)) {
+			/* Transaction no longer active: no implicit x-lock */
+
+			break;
+		}
+
+		/* If the transaction is still active, the previous version
+		of clust_rec must be accessible if not a fresh insert; we
+		may assert the following: */
+
+		ut_ad(err == DB_SUCCESS);
+						
+		if (prev_version == NULL) {
+			/* It was a freshly inserted version: there is an
+			implicit x-lock on rec */
+
+			trx = trx_get_on_id(trx_id);
+
+			break;
+		}
+
+		/* If we get here, we know that the trx_id transaction is
+		still active and it has modified prev_version. Let us check
+		if prev_version would require rec to be in a different
+		state. */
+
+		vers_del = rec_get_deleted_flag(prev_version, comp);
+
+		/* We check if entry and rec are identified in the alphabetical
+		ordering, that is, whether cmp_dtuple_rec() returns 0 for
+		them */
+		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+			/* The delete marks of rec and prev_version should be
+			equal for rec to be in the state required by
+			prev_version */
+
+			if (rec_del != vers_del) {
+				trx = trx_get_on_id(trx_id);
+
+				break;
+			}
+
+			/* It is possible that the row was updated so that the
+			secondary index record remained the same in
+			alphabetical ordering, but the field values changed
+			still. For example, 'abc' -> 'ABC'. Check also that. */
+
+			dtuple_set_types_binary(entry,
+						dtuple_get_n_fields(entry));
+			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+				trx = trx_get_on_id(trx_id);
+
+				break;
+			}
+		} else if (!rec_del) {
+			/* The delete mark should be set in rec for it to be
+			in the state required by prev_version */
+
+			trx = trx_get_on_id(trx_id);
+
+			break;
+		}
+
+		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+								clust_offsets);
+
+		if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+			/* The versions modified by the trx_id transaction end
+			at prev_version: no implicit x-lock */
+
+			break;
+		}
+
+		version = prev_version;
+	}/* for (;;) */
+
+exit_func:
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	return(trx);
+}
+
+/*********************************************************************
+Tells whether a delete marked earlier version of a clustered index record
+has to be kept because it is >= the purge view. The decision is made by
+trx_purge_update_undo_must_exist() after purge_sys->latch has been
+s-latched in mtr. */
+
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+			/* out: TRUE if earlier version should be preserved */
+	dulint	trx_id,	/* in: transaction id in the version */
+	mtr_t*	mtr)	/* in: mtr holding the latch on the clustered index
+			record; it will also hold the latch on purge_view */
+{
+	ibool	must_preserve	= FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mtr_s_lock(&(purge_sys->latch), mtr);
+
+	if (trx_purge_update_undo_must_exist(trx_id)) {
+		/* Purge may not remove this delete marked record yet */
+
+		must_preserve = TRUE;
+	}
+
+	return(must_preserve);
+}
+
+/*********************************************************************
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry and ientry are identified in
+the alphabetical ordering; exactly in this case we return TRUE.
+The earlier versions are built one at a time with
+trx_undo_prev_version_build(); the search ends with FALSE when that call
+fails or reports no previous version. */
+
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+				/* out: TRUE if earlier version should have
+				ientry as its secondary index entry */
+	ibool		also_curr,/* in: TRUE if also rec is included in the
+				versions to search; otherwise only versions
+				prior to it are searched */
+	rec_t*		rec,	/* in: record in the clustered index; the
+				caller must have a latch on the page */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/* in: the secondary index */
+	dtuple_t*	ientry)	/* in: the secondary index entry */
+{
+	rec_t*		version;
+	rec_t*		prev_version;
+	dict_index_t*	clust_index;
+	ulint*		clust_offsets;
+	mem_heap_t*	heap;
+	mem_heap_t*	heap2;
+	dtuple_t*	row;
+	dtuple_t*	entry;
+	ulint		err;
+	ibool		comp;
+
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+	   	|| mtr_memo_contains(mtr, buf_block_align(rec),
+						MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+	mtr_s_lock(&(purge_sys->latch), mtr);
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	comp = index->table->comp;
+	ut_ad(comp == page_is_comp(buf_frame_align(rec)));
+	heap = mem_heap_create(1024);
+	clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+					ULINT_UNDEFINED, &heap);
+
+	if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+		row = row_build(ROW_COPY_POINTERS, clust_index,
+						rec, clust_offsets, heap);
+		entry = row_build_index_entry(row, index, heap);
+
+ 		/* NOTE that we cannot do the comparison as binary
+		fields because the row is maybe being modified so that
+		the clustered index record has already been updated
+		to a different binary value in a char field, but the
+		collation identifies the old and new value anyway! */
+
+		if (dtuple_datas_are_ordering_equal(ientry, entry)) {
+
+			mem_heap_free(heap);
+
+			return(TRUE);
+		}
+	}
+
+	version = rec;
+
+	for (;;) {
+		heap2 = heap; /* the heap of the current round is kept until
+				the previous version has been built into a
+				new heap */
+		heap = mem_heap_create(1024);
+		err = trx_undo_prev_version_build(rec, mtr, version,
+					clust_index, clust_offsets, heap,
+					&prev_version);
+		mem_heap_free(heap2); /* free version and clust_offsets */
+
+		if (err != DB_SUCCESS || !prev_version) {
+			/* Versions end here */
+
+			mem_heap_free(heap);
+
+			return(FALSE);
+		}
+
+		clust_offsets = rec_get_offsets(prev_version, clust_index,
+						NULL, ULINT_UNDEFINED, &heap);
+
+		if (!rec_get_deleted_flag(prev_version, comp)) {
+			row = row_build(ROW_COPY_POINTERS, clust_index,
+					prev_version, clust_offsets, heap);
+			entry = row_build_index_entry(row, index, heap);
+
+ 			/* NOTE that we cannot do the comparison as binary
+			fields because maybe the secondary index record has
+			already been updated to a different binary value in
+			a char field, but the collation identifies the old
+			and new value anyway! */
+
+			if (dtuple_datas_are_ordering_equal(ientry, entry)) {
+
+				mem_heap_free(heap);
+
+				return(TRUE);
+			}
+		}
+
+		version = prev_version;
+	}
+}
+
+/*********************************************************************
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+Earlier versions are built one at a time until one is found whose trx id
+the view sees; that version is copied to in_heap. purge_sys->latch is
+s-locked with rw_lock_s_lock() for the duration of the search and is
+released before returning. If trx_undo_prev_version_build() fails, its
+error code is returned and *old_vers is not assigned. */
+
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+				/* out: DB_SUCCESS or DB_MISSING_HISTORY */
+	rec_t*		rec,	/* in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec */
+	dict_index_t*	index,	/* in: the clustered index */
+	ulint**		offsets,/* in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	read_view_t*	view,	/* in: the consistent read view */
+	mem_heap_t**	offset_heap,/* in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/* in: memory heap from which the memory for
+				old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers)/* out, own: old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+{
+	rec_t*		version;
+	rec_t*		prev_version;
+	dulint		prev_trx_id;
+	mem_heap_t*	heap		= NULL;
+	byte*		buf;
+	ulint		err;
+
+	ut_ad(index->type & DICT_CLUSTERED);
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+	   	|| mtr_memo_contains(mtr, buf_block_align(rec),
+						MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rec_offs_validate(rec, index, *offsets));
+
+	ut_ad(!read_view_sees_trx_id(view,
+				row_get_rec_trx_id(rec, index, *offsets)));
+
+	rw_lock_s_lock(&(purge_sys->latch));
+	version = rec;
+
+	for (;;) {
+		mem_heap_t*	heap2	= heap; /* heap of the previous
+					round, NULL on the first round */
+		heap = mem_heap_create(1024);
+
+		err = trx_undo_prev_version_build(rec, mtr, version, index,
+						*offsets, heap, &prev_version);
+		if (heap2) {
+			mem_heap_free(heap2); /* free version */
+		}
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		if (prev_version == NULL) {
+			/* It was a freshly inserted version */
+			*old_vers = NULL;
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		*offsets = rec_get_offsets(prev_version, index, *offsets,
+					ULINT_UNDEFINED, offset_heap);
+		prev_trx_id = row_get_rec_trx_id(prev_version, index,
+					*offsets);
+
+		if (read_view_sees_trx_id(view, prev_trx_id)) {
+
+			/* The view already sees this version: we can copy
+			it to in_heap and return */
+
+			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			*old_vers = rec_copy(buf, prev_version, *offsets);
+			rec_offs_make_valid(*old_vers, index, *offsets);
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		version = prev_version;
+	}/* for (;;) */
+
+	mem_heap_free(heap);
+	rw_lock_s_unlock(&(purge_sys->latch));
+
+	return(err);
+}
author	unknown <brian@zim.(none)>	2005-04-26 19:07:13 -0700
committer	unknown <brian@zim.(none)>	2005-04-26 19:07:13 -0700
commit	a9da10f7a8e626aac0bd3a8d82b20c7b864c7061 (patch)
tree	7028a9aade64c6143da00a4301627ce5d262c0f0 /storage/innobase/row
parent	a2ed27af5291f778c1fc6e23cf4edc1bc36f0bed (diff)
parent	25311ea4a5f83652959a0744d99a4eb51aa9d328 (diff)
download	mariadb-git-a9da10f7a8e626aac0bd3a8d82b20c7b864c7061.tar.gz