author		Guilhem Bichot <guilhem@mysql.com>	2009-08-07 12:16:00 +0200
committer	Guilhem Bichot <guilhem@mysql.com>	2009-08-07 12:16:00 +0200
commit		7f2dd1570420a8fe096324db972ae3c775113a33 (patch)
tree		65b42f5cb11f29ea5b4414ff075ccafd48569ad6 /storage/innobase/row
parent		10080d547e012aac4feb566fd7b51bbb485a717b (diff)
download	mariadb-git-7f2dd1570420a8fe096324db972ae3c775113a33.tar.gz
Renamed storage/innodb_plugin to storage/innobase, so that
1) it's the same layout as we always had in trees containing only the builtin
2) win\configure.js WITH_INNOBASE_STORAGE_ENGINE still works.

storage/innobase/CMakeLists.txt: fix to new directory name (and like 5.1)
storage/innobase/Makefile.am: fix to new directory name (and like 5.1)
storage/innobase/handler/ha_innodb.cc: fix to new directory name (and like 5.1)
storage/innobase/plug.in: fix to new directory name (and like 5.1)
Diffstat (limited to 'storage/innobase/row')
-rw-r--r--	storage/innobase/row/row0ext.c	 115
-rw-r--r--	storage/innobase/row/row0ins.c	2508
-rw-r--r--	storage/innobase/row/row0merge.c	2364
-rw-r--r--	storage/innobase/row/row0mysql.c	4241
-rw-r--r--	storage/innobase/row/row0purge.c	 689
-rw-r--r--	storage/innobase/row/row0row.c	1168
-rw-r--r--	storage/innobase/row/row0sel.c	4736
-rw-r--r--	storage/innobase/row/row0uins.c	 350
-rw-r--r--	storage/innobase/row/row0umod.c	 815
-rw-r--r--	storage/innobase/row/row0undo.c	 377
-rw-r--r--	storage/innobase/row/row0upd.c	2177
-rw-r--r--	storage/innobase/row/row0vers.c	 741
12 files changed, 20281 insertions, 0 deletions
diff --git a/storage/innobase/row/row0ext.c b/storage/innobase/row/row0ext.c
new file mode 100644
index 00000000000..7320f5b1dca
--- /dev/null
+++ b/storage/innobase/row/row0ext.c
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.c
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+
+#ifdef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#include "btr0cur.h"
+
+/********************************************************************//**
+Fills the column prefix cache of an externally stored column. */
+static
+void
+row_ext_cache_fill(
+/*===============*/
+ row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ const dfield_t* dfield) /*!< in: data field */
+{
+ const byte* field = dfield_get_data(dfield);
+ ulint f_len = dfield_get_len(dfield);
+ byte* buf = ext->buf + i * REC_MAX_INDEX_COL_LEN;
+
+ ut_ad(i < ext->n_ext);
+ ut_ad(dfield_is_ext(dfield));
+ ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+ field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The BLOB pointer is not set: we cannot fetch it */
+ ext->len[i] = 0;
+ } else {
+ /* Fetch at most REC_MAX_INDEX_COL_LEN of the column.
+ The column should be non-empty. However,
+ trx_rollback_or_clean_all_recovered() may try to
+ access a half-deleted BLOB if the server previously
+ crashed during the execution of
+ btr_free_externally_stored_field(). */
+ ext->len[i] = btr_copy_externally_stored_field_prefix(
+ buf, REC_MAX_INDEX_COL_LEN, zip_size, field, f_len);
+ }
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ ulint i;
+ row_ext_t* ret = mem_heap_alloc(heap, (sizeof *ret)
+ + (n_ext - 1) * sizeof ret->len);
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ ret->n_ext = n_ext;
+ ret->ext = ext;
+ ret->buf = mem_heap_alloc(heap, n_ext * REC_MAX_INDEX_COL_LEN);
+#ifdef UNIV_DEBUG
+ memset(ret->buf, 0xaa, n_ext * REC_MAX_INDEX_COL_LEN);
+ UNIV_MEM_ALLOC(ret->buf, n_ext * REC_MAX_INDEX_COL_LEN);
+#endif
+
+ /* Fetch the BLOB prefixes */
+ for (i = 0; i < n_ext; i++) {
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple, ext[i]);
+ row_ext_cache_fill(ret, i, zip_size, dfield);
+ }
+
+ return(ret);
+}
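+
+/* A minimal usage sketch of row_ext_create() (cols[], tuple, zip_size
+and heap below are hypothetical; the API is the one defined above).
+A caller that has collected the col_no's of the externally stored
+columns might do:
+
+ ulint cols[2] = {4, 7};
+ row_ext_t* ext = row_ext_create(2, cols, tuple, zip_size, heap);
+
+After the call, ext->len[i] == 0 means that the i'th BLOB pointer was
+unset, so no prefix could be cached for that column. */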
diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c
new file mode 100644
index 00000000000..930c9ec1fc7
--- /dev/null
+++ b/storage/innobase/row/row0ins.c
@@ -0,0 +1,2508 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.c
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+#include "buf0lru.h"
+
+#define ROW_INS_PREV 1
+#define ROW_INS_NEXT 2
+
+
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+ ulint ins_type, /*!< in: INS_VALUES, ... */
+ dict_table_t* table, /*!< in: table where to insert */
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ ins_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+ node->common.type = QUE_NODE_INSERT;
+
+ node->ins_type = ins_type;
+
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->table = table;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->select = NULL;
+
+ node->trx_id = ut_dulint_zero;
+
+ node->entry_sys_heap = mem_heap_create(128);
+
+ node->magic_n = INS_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***********************************************************//**
+Creates an entry template for each index of a table. */
+UNIV_INTERN
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ dict_index_t* index;
+ dtuple_t* entry;
+
+ ut_ad(node->entry_sys_heap);
+
+ UT_LIST_INIT(node->entry_list);
+
+ index = dict_table_get_first_index(node->table);
+
+ while (index != NULL) {
+ entry = row_build_index_entry(node->row, NULL, index,
+ node->entry_sys_heap);
+ UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+
+ index = dict_table_get_next_index(index);
+ }
+}
+
+/*****************************************************************//**
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /*!< in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ const dict_col_t* col;
+ dfield_t* dfield;
+ byte* ptr;
+
+ row = node->row;
+ table = node->table;
+ heap = node->entry_sys_heap;
+
+ ut_ad(row && table && heap);
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+ /* 1. Allocate buffer for row id */
+
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+ node->row_id_buf = ptr;
+
+ /* 2. Allocate buffer for trx id */
+
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+ node->trx_id_buf = ptr;
+
+ /* 3. Allocate buffer for roll ptr */
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; it is
+quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row) /*!< in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate buffers for sys fields from entry_sys_heap */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = ut_dulint_zero;
+}
+
+/*******************************************************************//**
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* dummy_big_rec;
+ mem_heap_t* heap;
+ upd_t* update;
+ rec_t* rec;
+ ulint err;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!dict_index_is_clust(cursor->index));
+ ut_ad(rec_get_deleted_flag(rec,
+ dict_table_is_comp(cursor->index->table)));
+
+ /* We know that in the alphabetical ordering, entry and rec are
+ identical. But in their binary form there may be differences if
+ there are char fields in them. Therefore we have to calculate the
+ difference. */
+
+ heap = mem_heap_create(1024);
+
+ update = row_upd_build_sec_rec_difference_binary(
+ cursor->index, entry, rec, thr_get_trx(thr), heap);
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+ update, 0, thr, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+
+ goto func_exit;
+ }
+
+ err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+ &heap, &dummy_big_rec, update,
+ 0, thr, mtr);
+ ut_ad(!dummy_big_rec);
+ }
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
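+
+/* A sketch of the retry protocol implied by the mode parameter above
+(control flow only; cursor setup and mtr handling are omitted and the
+call sites are illustrative):
+
+ err = row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF, ...);
+ if (err == DB_FAIL) {
+ ... commit the mtr, restart it with a tree latch ...
+ err = row_ins_sec_index_entry_by_modify(BTR_MODIFY_TREE, ...);
+ }
+
+DB_OVERFLOW, DB_UNDERFLOW and DB_ZIP_OVERFLOW from the optimistic path
+are mapped to DB_FAIL above precisely so that a caller needs only this
+single retry signal. */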
+
+/*******************************************************************//**
+Does an insert operation by delete-unmarking and updating a delete-marked
+existing record in the index. This situation can occur if the delete-marked
+record is kept in the index for consistent reads.
+@return DB_SUCCESS, DB_FAIL, or error code */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ big_rec_t** big_rec,/*!< out: possible big rec vector of fields
+ which have to be stored externally by the
+ caller */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ rec_t* rec;
+ upd_t* update;
+ ulint err;
+
+ ut_ad(dict_index_is_clust(cursor->index));
+
+ *big_rec = NULL;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec,
+ dict_table_is_comp(cursor->index->table)));
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may NOT contain system columns trx_id or
+ roll_ptr */
+
+ update = row_upd_build_difference_binary(cursor->index, entry, rec,
+ thr_get_trx(thr), *heap);
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+
+ }
+ err = btr_cur_pessimistic_update(0, cursor,
+ heap, big_rec, update,
+ 0, thr, mtr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Returns TRUE if, in a cascaded update/delete, an ancestor node of the given
+node updates (not DELETE, but UPDATE) the given table.
+@return TRUE if an ancestor updates table */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+ que_node_t* node, /*!< in: node in a query graph */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* parent;
+ upd_node_t* upd_node;
+
+ parent = que_node_get_parent(node);
+
+ while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+
+ upd_node = parent;
+
+ if (upd_node->table == table && upd_node->is_delete == FALSE) {
+
+ return(TRUE);
+ }
+
+ parent = que_node_get_parent(parent);
+
+ ut_a(parent);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node.
+@return number of ancestors */
+static
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+ que_node_t* node) /*!< in: node in a query graph */
+{
+ que_node_t* parent;
+ ulint n_ancestors = 0;
+
+ parent = que_node_get_parent(node);
+
+ while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+ n_ancestors++;
+
+ parent = que_node_get_parent(parent);
+
+ ut_a(parent);
+ }
+
+ return(n_ancestors);
+}
+
+/******************************************************************//**
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update.
+@return number of fields in the calculated update vector; the value
+can also be 0 if no foreign key fields changed; the returned value is
+ULINT_UNDEFINED if the column type in the child table is too short to
+fit the new value in the parent table: that means the update fails */
+static
+ulint
+row_ins_cascade_calc_update_vec(
+/*============================*/
+ upd_node_t* node, /*!< in: update node of the parent
+ table */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ mem_heap_t* heap) /*!< in: memory heap to use as
+ temporary storage */
+{
+ upd_node_t* cascade = node->cascade_node;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index = foreign->foreign_index;
+ upd_t* update;
+ upd_field_t* ufield;
+ dict_table_t* parent_table;
+ dict_index_t* parent_index;
+ upd_t* parent_update;
+ upd_field_t* parent_ufield;
+ ulint n_fields_updated;
+ ulint parent_field_no;
+ ulint i;
+ ulint j;
+
+ ut_a(node);
+ ut_a(foreign);
+ ut_a(cascade);
+ ut_a(table);
+ ut_a(index);
+
+ /* Calculate the appropriate update vector which will set the fields
+ in the child index record to the same value (possibly padded with
+ spaces if the column is a fixed length CHAR or FIXBINARY column) as
+ the referenced index record will get in the update. */
+
+ parent_table = node->table;
+ ut_a(parent_table == foreign->referenced_table);
+ parent_index = foreign->referenced_index;
+ parent_update = node->update;
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ n_fields_updated = 0;
+
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table,
+ dict_index_get_nth_col_no(parent_index, i));
+
+ for (j = 0; j < parent_update->n_fields; j++) {
+ parent_ufield = parent_update->fields + j;
+
+ if (parent_ufield->field_no == parent_field_no) {
+
+ ulint min_size;
+ const dict_col_t* col;
+ ulint ufield_len;
+
+ col = dict_index_get_nth_col(index, i);
+
+ /* A field in the parent index record is
+ updated. Let us make the update vector
+ field for the child table. */
+
+ ufield = update->fields + n_fields_updated;
+
+ ufield->field_no
+ = dict_table_get_nth_col_pos(
+ table, dict_col_get_no(col));
+ ufield->exp = NULL;
+
+ ufield->new_val = parent_ufield->new_val;
+ ufield_len = dfield_get_len(&ufield->new_val);
+
+ /* Clear the "external storage" flag */
+ dfield_set_len(&ufield->new_val, ufield_len);
+
+ /* Do not allow a NOT NULL column to be
+ updated as NULL */
+
+ if (dfield_is_null(&ufield->new_val)
+ && (col->prtype & DATA_NOT_NULL)) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the new value would not fit in the
+ column, do not allow the update */
+
+ if (!dfield_is_null(&ufield->new_val)
+ && dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ col->len,
+ ufield_len,
+ dfield_get_data(&ufield->new_val))
+ < ufield_len) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the parent column type has a different
+ length than the child column type, we may
+ need to pad with spaces the new value of the
+ child column */
+
+ min_size = dict_col_get_min_size(col);
+
+ /* Because UNIV_SQL_NULL (the marker
+ of SQL NULL values) exceeds all possible
+ values of min_size, the test below will
+ not hold for SQL NULL columns. */
+
+ if (min_size > ufield_len) {
+
+ char* pad_start;
+ const char* pad_end;
+ char* padded_data
+ = mem_heap_alloc(
+ heap, min_size);
+ pad_start = padded_data + ufield_len;
+ pad_end = padded_data + min_size;
+
+ memcpy(padded_data,
+ dfield_get_data(&ufield
+ ->new_val),
+ dfield_get_len(&ufield
+ ->new_val));
+
+ switch (UNIV_EXPECT(col->mbminlen, 1)) {
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ case 1:
+ if (UNIV_UNLIKELY
+ (dtype_get_charset_coll(
+ col->prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+ /* Do not pad BINARY
+ columns. */
+ return(ULINT_UNDEFINED);
+ }
+
+ /* space=0x20 */
+ memset(pad_start, 0x20,
+ pad_end - pad_start);
+ break;
+ case 2:
+ /* space=0x0020 */
+ ut_a(!(ufield_len % 2));
+ ut_a(!(min_size % 2));
+ do {
+ *pad_start++ = 0x00;
+ *pad_start++ = 0x20;
+ } while (pad_start < pad_end);
+ break;
+ }
+
+ dfield_set_data(&ufield->new_val,
+ padded_data, min_size);
+ }
+
+ n_fields_updated++;
+ }
+ }
+ }
+
+ update->n_fields = n_fields_updated;
+
+ return(n_fields_updated);
+}
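+
+/* A worked example of the padding above (values illustrative): for a
+child column CHAR(2) in a 2-byte-per-character charset (mbminlen == 2),
+dict_col_get_min_size() yields 4 bytes.  If the cascaded new value is
+the single character 'A' (2 bytes, 0x0041), padded_data receives 0x0041
+followed by one UCS-2 space 0x0020, and the update vector field is set
+to those 4 bytes, preserving the fixed-length format of the column. */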
+
+/*********************************************************************//**
+Set detailed error message associated with foreign key errors for
+the given transaction. */
+static
+void
+row_ins_set_detailed(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ mutex_enter(&srv_misc_tmpfile_mutex);
+ rewind(srv_misc_tmpfile);
+
+ if (os_file_set_eof(srv_misc_tmpfile)) {
+ ut_print_name(srv_misc_tmpfile, trx, TRUE,
+ foreign->foreign_table_name);
+ dict_print_info_on_foreign_key_in_create_format(
+ srv_misc_tmpfile, trx, foreign, FALSE);
+ trx_set_detailed_error_from_file(trx, srv_misc_tmpfile);
+ } else {
+ trx_set_detailed_error(trx, "temp file operation failed");
+ }
+
+ mutex_exit(&srv_misc_tmpfile_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+ const char* errstr, /*!< in: error string from the viewpoint
+ of the parent table */
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a matching index record in the
+ child table */
+ const dtuple_t* entry) /*!< in: index entry in the parent
+ table */
+{
+ FILE* ef = dict_foreign_err_file;
+ trx_t* trx = thr_get_trx(thr);
+
+ row_ins_set_detailed(trx, foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx, 600);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+ TRUE);
+ putc('\n', ef);
+ fputs(errstr, ef);
+ fputs(" in parent table, in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in child table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+ if (rec) {
+ fputs(", there is a record:\n", ef);
+ rec_print(ef, rec, foreign->foreign_index);
+ } else {
+ fputs(", the record is not available\n", ef);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a record in the parent table:
+ it does not match entry because we
+ have an error! */
+ const dtuple_t* entry) /*!< in: index entry to insert in the
+ child table */
+{
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx, 600);
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+ TRUE);
+ fputs("\nTrying to add in child table, in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized.
+ It would be better to only display the user columns. */
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in parent table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->referenced_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+ fputs(",\nthe closest match we can find is record:\n", ef);
+ if (rec && page_rec_is_supremum(rec)) {
+ /* If the cursor ended on a supremum record, it is better
+ to report the previous record in the error message, so that
+ the user gets a more descriptive error message. */
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (rec) {
+ rec_print(ef, rec, foreign->referenced_index);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Invalidate the query cache for the given table. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ const char* name) /*!< in: table name prefixed with
+ database name and a '/' character */
+{
+ char* buf;
+ char* ptr;
+ ulint len = strlen(name) + 1;
+
+ buf = mem_strdupl(name, len);
+
+ ptr = strchr(buf, '/');
+ ut_a(ptr);
+ *ptr = '\0';
+
+ innobase_invalidate_query_cache(thr_get_trx(thr), buf, len);
+ mem_free(buf);
+}
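+
+/* Example of the transformation above: for name "test/child" the copy
+becomes "test\0child" once the '/' is overwritten, while len still
+counts the whole string plus its terminating NUL.  This matches the
+db\0table\0 key layout used by the MySQL query cache, which is
+presumably why the full length is passed on to
+innobase_invalidate_query_cache(). */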
+
+/*********************************************************************//**
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */
+static
+ulint
+row_ins_foreign_check_on_constraint(
+/*================================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ btr_pcur_t* pcur, /*!< in: cursor placed on a matching
+ index record in the child table */
+ dtuple_t* entry, /*!< in: index entry in the parent
+ table */
+ mtr_t* mtr) /*!< in: mtr holding the latch of pcur
+ page */
+{
+ upd_node_t* node;
+ upd_node_t* cascade;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ mem_heap_t* upd_vec_heap = NULL;
+ const rec_t* rec;
+ const rec_t* clust_rec;
+ const buf_block_t* clust_block;
+ upd_t* update;
+ ulint n_to_update;
+ ulint err;
+ ulint i;
+ trx_t* trx;
+ mem_heap_t* tmp_heap = NULL;
+
+ ut_a(thr);
+ ut_a(foreign);
+ ut_a(pcur);
+ ut_a(mtr);
+
+ trx = thr_get_trx(thr);
+
+ /* Since we are going to delete or update a row, we have to invalidate
+ the MySQL query cache for table. A deadlock of threads is not possible
+ here because the caller of this function does not hold any latches with
+ the sync0sync.h rank above the kernel mutex. The query cache mutex has
+ a rank just above the kernel mutex. */
+
+ row_ins_invalidate_query_cache(thr, table->name);
+
+ node = thr->run_node;
+
+ if (node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+ row_ins_foreign_report_err("Trying to delete",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (!node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_UPDATE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* This is an UPDATE */
+
+ row_ins_foreign_report_err("Trying to update",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (node->cascade_node == NULL) {
+ /* Extend our query graph by creating a child to current
+ update node. The child is used in the cascade or set null
+ operation. */
+
+ node->cascade_heap = mem_heap_create(128);
+ node->cascade_node = row_create_update_node_for_mysql(
+ table, node->cascade_heap);
+ que_node_set_parent(node->cascade_node, node);
+ }
+
+ /* Initialize cascade_node to do the operation we want. Note that we
+ use the SAME cascade node to do all foreign key operations of the
+ SQL DELETE: the table of the cascade node may change if there are
+ several child tables to the table where the delete is done! */
+
+ cascade = node->cascade_node;
+
+ cascade->table = table;
+
+ cascade->foreign = foreign;
+
+ if (node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+ cascade->is_delete = TRUE;
+ } else {
+ cascade->is_delete = FALSE;
+
+ if (foreign->n_fields > cascade->update_n_fields) {
+ /* We have to make the update vector longer */
+
+ cascade->update = upd_create(foreign->n_fields,
+ node->cascade_heap);
+ cascade->update_n_fields = foreign->n_fields;
+ }
+ }
+
+ /* We do not allow cyclic cascaded updating (DELETE is allowed,
+ but not UPDATE) of the same table, as this can lead to an infinite
+ cycle. Check that we are not updating the same table which is
+ already being modified in this cascade chain. We have to check
+ this also because the modification of the indexes of a 'parent'
+ table may still be incomplete, and we must avoid seeing the indexes
+ of the parent table in an inconsistent state! */
+
+ if (!cascade->is_delete
+ && row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+ /* We do not know if this would break foreign key
+ constraints, but play safe and return an error */
+
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying an update, possibly causing a cyclic"
+ " cascaded update\n"
+ "in the child table,", thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (row_ins_cascade_n_ancestors(cascade) >= 15) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a too deep cascaded delete or update\n",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_a(index == foreign->foreign_index);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (dict_index_is_clust(index)) {
+ /* pcur is already positioned in the clustered index of
+ the child table */
+
+ clust_index = index;
+ clust_rec = rec;
+ clust_block = btr_pcur_get_block(pcur);
+ } else {
+ /* We have to look for the record in the clustered index
+ in the child table */
+
+ clust_index = dict_table_get_first_index(table);
+
+ tmp_heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+ tmp_heap);
+ btr_pcur_open_with_no_init(clust_index, ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(cascade->pcur);
+ clust_block = btr_pcur_get_block(cascade->pcur);
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(cascade->pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ fputs("InnoDB: error in cascade of a foreign key op\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ fputs("\n"
+ "InnoDB: clustered record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com\n", stderr);
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Set an X-lock on the row to delete or update in the child table */
+
+ err = lock_table(0, table, LOCK_IX, thr);
+
+ if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
+ err = lock_clust_rec_read_check_and_lock_alt(
+ 0, clust_block, clust_rec, clust_index,
+ LOCK_X, LOCK_REC_NOT_GAP, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto nonstandard_exit_func;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) {
+ /* This can happen if there is a circular reference of
+ rows such that the cascading delete comes to delete a row
+ already in the process of being delete-marked */
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ if ((node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL))
+ || (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* Build the appropriate update vector which sets
+ foreign->n_fields first fields in rec to SQL NULL */
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ upd_field_t* ufield = &update->fields[i];
+
+ ufield->field_no = dict_table_get_nth_col_pos(
+ table,
+ dict_index_get_nth_col_no(index, i));
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+ dfield_set_null(&ufield->new_val);
+ }
+ }
+
+ if (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+ /* Build the appropriate update vector which sets changing
+ foreign->n_fields first fields in rec to new values */
+
+ upd_vec_heap = mem_heap_create(256);
+
+ n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
+ upd_vec_heap);
+ if (n_to_update == ULINT_UNDEFINED) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a cascaded update where the"
+ " updated value in the child\n"
+ "table would not fit in the length"
+ " of the column, or the value would\n"
+ "be NULL and the column is"
+ " declared as not NULL in the child table,",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (cascade->update->n_fields == 0) {
+
+ /* The update does not change any columns referred
+ to in this foreign key constraint: no need to do
+ anything */
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Store pcur position and initialize or store the cascade node
+ pcur stored position */
+
+ btr_pcur_store_position(pcur, mtr);
+
+ if (index == clust_index) {
+ btr_pcur_copy_stored_position(cascade->pcur, pcur);
+ } else {
+ btr_pcur_store_position(cascade->pcur, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+ cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ err = row_update_cascade_for_mysql(thr, cascade,
+ foreign->foreign_table);
+
+ if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
+ fprintf(stderr,
+ "InnoDB: error: table %s has the counter 0"
+ " though there is\n"
+ "InnoDB: a FOREIGN KEY check running on it.\n",
+ foreign->foreign_table->name);
+ }
+
+ /* Release the data dictionary latch for a while, so that we do not
+ starve other threads from doing CREATE TABLE etc. if we have a huge
+ cascaded operation running. The counter n_foreign_key_checks_running
+ will prevent other users from dropping or ALTERing the table when we
+ release the latch. */
+
+ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+ row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+ mtr_start(mtr);
+
+ /* Restore pcur position */
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ return(err);
+
+nonstandard_exit_func:
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ btr_pcur_store_position(pcur, mtr);
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_ins_set_shared_rec_lock(
+/*========================*/
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a record. Used in locking possible duplicate key
+records.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock.
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* upd_node;
+ dict_table_t* check_table;
+ dict_index_t* check_index;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ ibool moved;
+ int cmp;
+ ulint err;
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ err = DB_SUCCESS;
+
+ if (trx->check_foreigns == FALSE) {
+ /* The user has suppressed foreign key checks currently for
+ this session */
+ goto exit_func;
+ }
+
+ /* If any of the foreign key fields in entry is SQL NULL, we
+ suppress the foreign key check: this is compatible with Oracle,
+ for example */
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ goto exit_func;
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+ upd_node = thr->run_node;
+
+ if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+ /* If a cascaded update is done as defined by a
+ foreign key constraint, do not check that
+ constraint for the child row. In ON UPDATE CASCADE
+ the update of the parent row is only half done when
+ we come here: if we checked the constraint here
+ for the child row, it would fail.
+
+ A QUESTION remains: if in the child table there are
+ several constraints which refer to the same parent
+ table, should we merge all updates to the child as
+ one update? The updates can even be contradictory!
+ Currently we just perform the update associated
+ with each foreign key constraint, one after
+ another, and the user has problems predicting in
+ which order they are performed. */
+
+ goto exit_func;
+ }
+ }
+
+ if (check_ref) {
+ check_table = foreign->referenced_table;
+ check_index = foreign->referenced_index;
+ } else {
+ check_table = foreign->foreign_table;
+ check_index = foreign->foreign_index;
+ }
+
+ if (check_table == NULL || check_table->ibd_file_missing) {
+ if (check_ref) {
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx, 600);
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(
+ ef, trx, foreign, TRUE);
+ fputs("\nTrying to add to index ", ef);
+ ut_print_name(ef, trx, FALSE,
+ foreign->foreign_index->name);
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ fputs("\nBut the parent table ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->referenced_table_name);
+ fputs("\nor its .ibd file does"
+ " not currently exist!\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_NO_REFERENCED_ROW;
+ }
+
+ goto exit_func;
+ }
+
+ ut_a(check_table);
+ ut_a(check_index);
+
+ if (check_table != table) {
+ /* We already have a LOCK_IX on table, but not necessarily
+ on check_table */
+
+ err = lock_table(0, check_table, LOCK_IS, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto do_possible_lock_wait;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+
+ btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check if there is a matching record */
+
+ for (;;) {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ goto next_rec;
+ }
+
+ offsets = rec_get_offsets(rec, check_index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ if (page_rec_is_supremum(rec)) {
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block,
+ rec, check_index,
+ offsets, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ goto next_rec;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (rec_get_deleted_flag(rec,
+ rec_offs_comp(offsets))) {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, check_index, offsets, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+ } else {
+ /* Found a matching record. Lock only
+ a record because we can allow inserts
+ into gaps */
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, block,
+ rec, check_index, offsets, thr);
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ if (check_ref) {
+ err = DB_SUCCESS;
+
+ break;
+ } else if (foreign->type != 0) {
+ /* There is an ON UPDATE or ON DELETE
+ condition: check them in a separate
+ function */
+
+ err = row_ins_foreign_check_on_constraint(
+ thr, foreign, &pcur, entry,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ /* Since reporting a plain
+ "duplicate key" error
+ message to the user in
+ cases where a long CASCADE
+ operation would lead to a
+ duplicate key in some
+ other table is very
+ confusing, map duplicate
+ key errors resulting from
+ FK constraints to a
+ separate error code. */
+
+ if (err == DB_DUPLICATE_KEY) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+
+ break;
+ }
+
+ /* row_ins_foreign_check_on_constraint
+ may have repositioned pcur on a
+ different block */
+ block = btr_pcur_get_block(&pcur);
+ } else {
+ row_ins_foreign_report_err(
+ "Trying to delete or update",
+ thr, foreign, rec, entry);
+
+ err = DB_ROW_IS_REFERENCED;
+ break;
+ }
+ }
+ }
+
+ if (cmp < 0) {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_GAP, block,
+ rec, check_index, offsets, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ if (check_ref) {
+ err = DB_NO_REFERENCED_ROW;
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ break;
+ }
+
+ ut_a(cmp == 0);
+next_rec:
+ moved = btr_pcur_move_to_next(&pcur, &mtr);
+
+ if (!moved) {
+ if (check_ref) {
+ rec = btr_pcur_get_rec(&pcur);
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ break;
+ }
+ }
+
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+ if (err == DB_LOCK_WAIT) {
+ trx->error_state = err;
+
+ que_thr_stop_for_mysql(thr);
+
+ srv_suspend_mysql_thread(thr);
+
+ if (trx->error_state == DB_SUCCESS) {
+
+ goto run_again;
+ }
+
+ err = trx->error_state;
+ }
+
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
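+
+/* A locking example for the scan above (schema and values are
+illustrative): an INSERT into a child table whose FOREIGN KEY (a)
+REFERENCES parent(a) with a = 5 positions pcur at the first parent
+index record >= 5.  If that record matches, the shared LOCK_REC_NOT_GAP
+lock keeps the parent row from being deleted before this transaction
+commits; if it is greater than 5, the shared LOCK_GAP lock keeps any
+parent row with a = 5 from being inserted, locking in the failure
+(DB_NO_REFERENCED_ROW). */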
+
+/***************************************************************//**
+Checks if foreign key constraints fail for an index entry. If the index
+is not mentioned in any constraint, this function does nothing.
+Otherwise it searches the indexes of the referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_ins_check_foreign_constraints(
+/*==============================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_foreign_t* foreign;
+ ulint err;
+ trx_t* trx;
+ ibool got_s_lock = FALSE;
+
+ trx = thr_get_trx(thr);
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign) {
+ if (foreign->foreign_index == index) {
+
+ if (foreign->referenced_table == NULL) {
+ dict_table_get(foreign->referenced_table_name,
+ FALSE);
+ }
+
+ if (0 == trx->dict_operation_lock_mode) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (foreign->referenced_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ (foreign->referenced_table
+ ->n_foreign_key_checks_running)++;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects the referenced
+ table from being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, entry, thr);
+
+ if (foreign->referenced_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ ut_a(foreign->referenced_table
+ ->n_foreign_key_checks_running > 0);
+ (foreign->referenced_table
+ ->n_foreign_key_checks_running)--;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Checks if inserting the index entry would cause a unique key violation
+with respect to rec.
+@return TRUE if error */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ const rec_t* rec, /*!< in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, offsets,
+ &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!dict_index_is_clust(index)) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+}
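+
+/* Example of the SQL NULL rule above: under a secondary UNIQUE KEY(a),
+any number of rows with a = NULL may coexist, because NULL != NULL for
+uniqueness purposes.  The exception never applies in the clustered
+index, since primary key columns are always NOT NULL. */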
+
+/***************************************************************//**
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Sets shared locks on possible duplicate records.
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ dict_index_t* index, /*!< in: non-clustered unique index */
+ dtuple_t* entry, /*!< in: index entry */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint n_unique;
+ ulint i;
+ int cmp;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ ulint err = DB_SUCCESS;
+ unsigned allow_duplicates;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ n_unique = dict_index_get_n_unique(index);
+
+ /* If the secondary index is unique, but one of the fields in the
+ n_unique first fields is NULL, a unique key violation cannot occur,
+ since we define NULL != NULL in this case */
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(DB_SUCCESS);
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ allow_duplicates = thr_get_trx(thr)->duplicates & TRX_DUP_IGNORE;
+
+ /* Scan index records and check if there is a duplicate */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (allow_duplicates) {
+
+ /* If the SQL query will update or replace
+ a duplicate key, we take an X-lock on the
+ duplicates (REPLACE, LOAD DATA INFILE ... REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, index, offsets, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ continue;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ index, offsets)) {
+ err = DB_DUPLICATE_KEY;
+
+ thr_get_trx(thr)->error_info = index;
+
+ break;
+ }
+ }
+
+ if (cmp < 0) {
+ break;
+ }
+
+ ut_a(cmp == 0);
+ } while (btr_pcur_move_to_next(&pcur, &mtr));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ return(err);
+}
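+
+/* A note on the n_fields_cmp trick above: a secondary index record
+also carries the clustered index (row reference) fields after the
+unique fields.  Setting n_fields_cmp to dict_index_get_n_unique(index)
+makes cmp_dtuple_rec() compare only the unique prefix, so records that
+differ only in the appended clustered index fields still yield
+cmp == 0 and are examined as potential duplicates. */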
+
+/***************************************************************//**
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index!
+@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error,
+DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record */
+static
+ulint
+row_ins_duplicate_error_in_clust(
+/*=============================*/
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ dtuple_t* entry, /*!< in: entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+ rec_t* rec;
+ ulint n_unique;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t*heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ UT_NOT_USED(mtr);
+
+ ut_a(dict_index_is_clust(cursor->index));
+ ut_ad(dict_index_is_unique(cursor->index));
+
+ /* NOTE: For unique non-clustered indexes there may be any number
+ of delete-marked records with the same value for the non-clustered
+ index key (remember multiversioning), which differ only in
+ the row reference part of the index record, containing the
+ clustered index key fields. For such a secondary index record,
+ to avoid a race condition, we must FIRST do the insertion and only
+ after that check that the uniqueness condition is not breached! */
+
+ /* NOTE: A problem is that in the B-tree node pointers on an
+ upper level may match more to the entry than the actual existing
+ user records on the leaf level. So, even if low_match would suggest
+ that a duplicate key violation may occur, this may not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* We set a lock on the possible duplicate: this
+ is needed in logical logging of MySQL to make
+ sure that in roll-forward we get the same duplicate
+ errors as in original execution */
+
+ if (trx->duplicates & TRX_DUP_IGNORE) {
+
+ /* If the SQL query will update or replace
+ a duplicate key, we take an X-lock on the
+ duplicates (REPLACE, LOAD DATA INFILE ... REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor), rec,
+ cursor->index, offsets, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ if (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (trx->duplicates & TRX_DUP_IGNORE) {
+
+ /* If the SQL query will update or replace
+ a duplicate key, we take an X-lock on the
+ duplicates (REPLACE, LOAD DATA INFILE ... REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+
+ /* This should never happen */
+ ut_a(!dict_index_is_clust(cursor->index));
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
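+
+/* A positioning note for the checks above: the caller searched with
+PAGE_CUR_LE, so a duplicate can only be the record the cursor rests on
+(reported by cursor->low_match) or its immediate successor (reported by
+cursor->up_match); both neighbours are therefore inspected. */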
+
+/***************************************************************//**
+Checks if an index entry has a long enough common prefix with an existing
+record that the intended insert of the entry must be changed to a modify of
+the existing record. In the case of a clustered index, the prefix must be
+n_unique fields long, and in the case of a secondary index, all fields must be
+equal.
+@return 0 if no update, ROW_INS_PREV if previous should be updated;
+currently we do the search so that only the low_match record can match
+enough to the search tuple, not the next record */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+ btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+ ulint enough_match;
+ rec_t* rec;
+
+ /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust)
+ Because node pointers on upper levels of the B-tree may match more
+ to entry than to actual user records on the leaf level, we have to
+ check if the candidate record is actually a user record. In a
+ clustered index node pointers contain index->n_unique first fields,
+ and in the case of a secondary index, all fields of the index. */
+
+ enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+
+ if (cursor->low_match >= enough_match) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+
+ return(ROW_INS_PREV);
+ }
+ }
+
+ return(0);
+}
+
+/***************************************************************//**
+Tries to insert an index entry into an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily delete-marked by a committed transaction, or a unique key
+violation error occurs. The insert is then performed by updating the
+delete-marked record, and we must write an undo log record for it.
+If the index is secondary, and a record with exactly the same fields
+is found, the other record is necessarily delete-marked. It is then
+delete-unmarked. Otherwise, the entry is simply inserted into the index.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed,
+or error code */
+static
+ulint
+row_ins_index_entry_low(
+/*====================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_cur_t cursor;
+ ulint ignore_sec_unique = 0;
+	ulint		modify = 0; /* initialized to silence a compiler
+				    warning */
+ rec_t* insert_rec;
+ rec_t* rec;
+ ulint err;
+ ulint n_unique;
+ big_rec_t* big_rec = NULL;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+
+ cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+
+ if (!(thr_get_trx(thr)->check_unique_secondary)) {
+ ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE;
+ }
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ mode | BTR_INSERT | ignore_sec_unique,
+ &cursor, 0, &mtr);
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+		/* The insertion was already made to the insert buffer
+		during the search: we are done */
+
+ err = DB_SUCCESS;
+
+ goto function_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(&cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_get_n_fields(first_rec, index)
+ == dtuple_get_n_fields(entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (dict_index_is_unique(index) && (cursor.up_match >= n_unique
+ || cursor.low_match >= n_unique)) {
+
+ if (dict_index_is_clust(index)) {
+ /* Note that the following may return also
+ DB_LOCK_WAIT */
+
+ err = row_ins_duplicate_error_in_clust(
+ &cursor, entry, thr, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ } else {
+ mtr_commit(&mtr);
+ err = row_ins_scan_sec_index_for_duplicate(
+ index, entry, thr);
+ mtr_start(&mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ /* We did not find a duplicate and we have now
+ locked with s-locks the necessary records to
+ prevent any insertion of a duplicate by another
+ transaction. Let us now reposition the cursor and
+ continue the insertion. */
+
+ btr_cur_search_to_nth_level(index, 0, entry,
+ PAGE_CUR_LE,
+ mode | BTR_INSERT,
+ &cursor, 0, &mtr);
+ }
+ }
+
+ modify = row_ins_must_modify(&cursor);
+
+ if (modify != 0) {
+		/* There is already an index entry with a long enough common
+		prefix; we must convert the insert into a modify of an
+		existing record */
+
+ if (modify == ROW_INS_NEXT) {
+ rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+ btr_cur_position(index, rec,
+ btr_cur_get_block(&cursor),&cursor);
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = row_ins_clust_index_entry_by_modify(
+ mode, &cursor, &heap, &big_rec, entry,
+ thr, &mtr);
+ } else {
+ ut_ad(!n_ext);
+ err = row_ins_sec_index_entry_by_modify(
+ mode, &cursor, entry, thr, &mtr);
+ }
+ } else {
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(
+ 0, &cursor, entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+
+ goto function_exit;
+ }
+ err = btr_cur_pessimistic_insert(
+ 0, &cursor, entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ }
+ }
+
+function_exit:
+ mtr_commit(&mtr);
+
+ if (UNIV_LIKELY_NULL(big_rec)) {
+ rec_t* rec;
+ ulint* offsets;
+ mtr_start(&mtr);
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0, &mtr);
+ rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(&cursor),
+ rec, offsets, big_rec, &mtr);
+
+ if (modify) {
+ dtuple_big_rec_free(big_rec);
+ } else {
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Inserts an index entry into an index. Tries first an optimistic, then a
+pessimistic descent down the tree. If the entry matches a delete-marked
+record closely enough, performs the insert by updating or delete-unmarking
+the delete-marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+ulint
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ ibool foreign,/*!< in: TRUE=check foreign key constraints */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) {
+ err = row_ins_check_foreign_constraints(index->table, index,
+ entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
+ n_ext, thr);
+ if (err != DB_FAIL) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
+ n_ext, thr);
+ return(err);
+}
+
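+/* A minimal sketch of the retry pattern above, added for illustration
+and not part of the original source: an optimistic descent latches only
+the leaf page and returns DB_FAIL if the insert would require a page
+split; only then is the more expensive pessimistic descent attempted.
+The callback type is hypothetical. */
+static
+ulint
+row_ins_example_retry(
+/*==================*/
+	ulint	(*insert_low)(ulint mode))	/*!< in: hypothetical
+						insert callback */
+{
+	ulint	err = insert_low(BTR_MODIFY_LEAF);	/* optimistic */
+
+	if (err == DB_FAIL) {
+		err = insert_low(BTR_MODIFY_TREE);	/* pessimistic */
+	}
+
+	return(err);
+}
+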
+/***********************************************************//**
+Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row. */
+static
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to make */
+ const dtuple_t* row) /*!< in: row */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(entry && row);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields; i++) {
+ dict_field_t* ind_field;
+ dfield_t* field;
+ const dfield_t* row_field;
+ ulint len;
+
+ field = dtuple_get_nth_field(entry, i);
+ ind_field = dict_index_get_nth_field(index, i);
+ row_field = dtuple_get_nth_field(row, ind_field->col->ind);
+ len = dfield_get_len(row_field);
+
+ /* Check column prefix indexes */
+ if (ind_field->prefix_len > 0
+ && dfield_get_len(row_field) != UNIV_SQL_NULL) {
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len,
+ len, dfield_get_data(row_field));
+
+ ut_ad(!dfield_is_ext(row_field));
+ }
+
+ dfield_set_data(field, dfield_get_data(row_field), len);
+ if (dfield_is_ext(row_field)) {
+ ut_ad(dict_index_is_clust(index));
+ dfield_set_ext(field);
+ }
+ }
+}
+
+/***********************************************************//**
+Inserts a single index entry to the table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ row_ins_index_entry_set_vals(node->index, node->entry, node->row);
+
+ ut_ad(dtuple_check_typed(node->entry));
+
+ err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Allocates a row id for the row, if one is needed. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ dulint row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_index_is_unique(dict_table_get_first_index(node->table))) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
+
+ dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***********************************************************//**
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are copied into the buffers of the value
+	expressions and it is safe to use them until the expressions are
+	evaluated again: therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are copied into the buffers of the select node
+	and it is safe to use them until we fetch from the select again:
+	therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Inserts a row to a table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_ins(
+/*====*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+ }
+
+ ut_ad(node->entry == NULL);
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ ulint err;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started(trx);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. MySQL's
+ partitioned table code may also call an insert within the same
+ SQL statement AFTER it has used this table handle to do a search.
+ This happens, for example, when a row update moves it to another
+ partition. In that case, we have already set the IX lock on the
+ table during the search operation, and there is no need to set
+ it again here. But we must write trx->id to node->trx_id_buf. */
+
+ trx_write_trx_id(node->trx_id_buf, trx->id);
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+ /* No need to do IX-locking */
+
+ goto same_trx;
+ }
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+same_trx:
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* err == DB_LOCK_WAIT or SQL error detected */
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
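+
+/* A minimal sketch of the IX-lock short-circuit used in row_ins_step()
+above, added for illustration and not part of the original source: the
+table IX lock is taken at most once per transaction per insert node. */
+static
+ibool
+row_ins_example_needs_ix_lock(
+/*==========================*/
+	const ins_node_t*	node,	/*!< in: insert node */
+	const trx_t*		trx)	/*!< in: transaction */
+{
+	/* if node->trx_id already equals trx->id, the lock is held */
+	return(!UT_DULINT_EQ(trx->id, node->trx_id));
+}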
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c
new file mode 100644
index 00000000000..05a45dc647c
--- /dev/null
+++ b/storage/innobase/row/row0merge.c
@@ -0,0 +1,2364 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.c
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "os0file.h"
+#include "lock0lock.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "log0log.h"
+#include "ut0sort.h"
+#include "handler0alter.h"
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout. */
+/* @{ */
+static ibool row_merge_print_cmp;
+static ibool row_merge_print_read;
+static ibool row_merge_print_write;
+/* @} */
+#endif /* UNIV_DEBUG */
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2. See the comment above
+ut_ad(data_size < sizeof(row_merge_block_t)). */
+typedef byte row_merge_block_t[1048576];
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte mrec_buf_t[UNIV_PAGE_SIZE];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte mrec_t;
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_struct {
+ mem_heap_t* heap; /*!< memory heap where allocated */
+ dict_index_t* index; /*!< the index the tuples belong to */
+ ulint total_size; /*!< total amount of data bytes */
+ ulint n_tuples; /*!< number of data tuples */
+ ulint max_tuples; /*!< maximum number of data tuples */
+ const dfield_t**tuples; /*!< array of pointers to
+ arrays of fields that form
+ the data tuples */
+ const dfield_t**tmp_tuples; /*!< temporary copy of tuples,
+ for sorting */
+};
+
+/** Buffer for sorting in main memory. */
+typedef struct row_merge_buf_struct row_merge_buf_t;
+
+/** Information about temporary files used in merge sort */
+struct merge_file_struct {
+ int fd; /*!< file descriptor */
+ ulint offset; /*!< file offset */
+};
+
+/** Information about temporary files used in merge sort */
+typedef struct merge_file_struct merge_file_t;
+
+#ifdef UNIV_DEBUG
+/******************************************************//**
+Display a merge tuple. */
+static
+void
+row_merge_tuple_print(
+/*==================*/
+ FILE* f, /*!< in: output stream */
+ const dfield_t* entry, /*!< in: tuple to print */
+ ulint n_fields)/*!< in: number of fields in the tuple */
+{
+ ulint j;
+
+ for (j = 0; j < n_fields; j++) {
+ const dfield_t* field = &entry[j];
+
+ if (dfield_is_null(field)) {
+ fputs("\n NULL;", f);
+ } else {
+ ulint field_len = dfield_get_len(field);
+ ulint len = ut_min(field_len, 20);
+ if (dfield_is_ext(field)) {
+ fputs("\nE", f);
+ } else {
+ fputs("\n ", f);
+ }
+ ut_print_buf(f, dfield_get_data(field), len);
+ if (len != field_len) {
+ fprintf(f, " (total %lu bytes)", field_len);
+ }
+ }
+ }
+ putc('\n', f);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_create_low(
+/*=====================*/
+ mem_heap_t* heap, /*!< in: heap where allocated */
+ dict_index_t* index, /*!< in: secondary index */
+ ulint max_tuples, /*!< in: maximum number of data tuples */
+ ulint buf_size) /*!< in: size of the buffer, in bytes */
+{
+ row_merge_buf_t* buf;
+
+ ut_ad(max_tuples > 0);
+ ut_ad(max_tuples <= sizeof(row_merge_block_t));
+ ut_ad(max_tuples < buf_size);
+
+ buf = mem_heap_zalloc(heap, buf_size);
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = mem_heap_alloc(heap,
+ 2 * max_tuples * sizeof *buf->tuples);
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+{
+ row_merge_buf_t* buf;
+ ulint max_tuples;
+ ulint buf_size;
+ mem_heap_t* heap;
+
+ max_tuples = sizeof(row_merge_block_t)
+ / ut_max(1, dict_index_get_min_size(index));
+
+ buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+
+ heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
+
+ buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
+
+ return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+{
+ ulint buf_size;
+ ulint max_tuples = buf->max_tuples;
+ mem_heap_t* heap = buf->heap;
+ dict_index_t* index = buf->index;
+
+ buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+
+ mem_heap_empty(heap);
+
+ return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+static
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */
+{
+ mem_heap_free(buf->heap);
+}
+
+/******************************************************//**
+Insert a data tuple into a sort buffer.
+@return TRUE if added, FALSE if out of space */
+static
+ibool
+row_merge_buf_add(
+/*==============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ const dtuple_t* row, /*!< in: row in clustered index */
+ const row_ext_t* ext) /*!< in: cache of externally stored
+ column prefixes, or NULL */
+{
+ ulint i;
+ ulint n_fields;
+ ulint data_size;
+ ulint extra_size;
+ const dict_index_t* index;
+ dfield_t* entry;
+ dfield_t* field;
+
+ if (buf->n_tuples >= buf->max_tuples) {
+ return(FALSE);
+ }
+
+ UNIV_PREFETCH_R(row->fields);
+
+ index = buf->index;
+
+ n_fields = dict_index_get_n_fields(index);
+
+ entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
+ buf->tuples[buf->n_tuples] = entry;
+ field = entry;
+
+ data_size = 0;
+ extra_size = UT_BITS_IN_BYTES(index->n_nullable);
+
+ for (i = 0; i < n_fields; i++, field++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint col_no;
+ const dfield_t* row_field;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(index, i);
+ col = ifield->col;
+ col_no = dict_col_get_no(col);
+ row_field = dtuple_get_nth_field(row, col_no);
+ dfield_copy(field, row_field);
+ len = dfield_get_len(field);
+
+ if (dfield_is_null(field)) {
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ } else if (UNIV_LIKELY(!ext)) {
+ } else if (dict_index_is_clust(index)) {
+ /* Flag externally stored fields. */
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ if (i < dict_index_get_n_unique(index)) {
+ dfield_set_data(field, buf, len);
+ } else {
+ dfield_set_ext(field);
+ len = dfield_get_len(field);
+ }
+ }
+ } else {
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ dfield_set_data(field, buf, len);
+ }
+ }
+
+ /* If a column prefix index, take only the prefix */
+
+ if (ifield->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ ifield->prefix_len,
+ len, dfield_get_data(field));
+ dfield_set_len(field, len);
+ }
+
+ ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+ if (ifield->fixed_len) {
+ ut_ad(len == ifield->fixed_len);
+ ut_ad(!dfield_is_ext(field));
+ } else if (dfield_is_ext(field)) {
+ extra_size += 2;
+ } else if (len < 128
+ || (col->len < 256 && col->mtype != DATA_BLOB)) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint size;
+ ulint extra;
+
+ size = rec_get_converted_size_comp(index,
+ REC_STATUS_ORDINARY,
+ entry, n_fields, &extra);
+
+ ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
+ ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Add to the total size of the record in row_merge_block_t
+ the encoded length of extra_size and the extra bytes (extra_size).
+ See row_merge_buf_write() for the variable-length encoding
+ of extra_size. */
+ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+ /* The following assertion may fail if row_merge_block_t is
+ declared very small and a PRIMARY KEY is being created with
+ many prefix columns. In that case, the record may exceed the
+ page_zip_rec_needs_ext() limit. However, no further columns
+ will be moved to external storage until the record is inserted
+ to the clustered index B-tree. */
+ ut_ad(data_size < sizeof(row_merge_block_t));
+
+ /* Reserve one byte for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
+ return(FALSE);
+ }
+
+ buf->total_size += data_size;
+ buf->n_tuples++;
+
+ field = entry;
+
+ /* Copy the data fields. */
+
+ do {
+ dfield_dup(field++, buf->heap);
+ } while (--n_fields);
+
+ return(TRUE);
+}
+
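+/* A minimal sketch of the on-disk size accounting above, added for
+illustration and not part of the original source: a merge record is
+stored as a 1- or 2-byte encoding of (extra_size + 1), followed by
+extra_size header bytes and then the data bytes. */
+static
+ulint
+row_merge_example_rec_disk_size(
+/*============================*/
+	ulint	extra_size,	/*!< in: extra bytes, excluding
+				REC_N_NEW_EXTRA_BYTES */
+	ulint	data_size)	/*!< in: data bytes */
+{
+	/* the length prefix grows to two bytes at 0x80 */
+	return((((extra_size + 1) >= 0x80) ? 2 : 1)
+	       + extra_size + data_size);
+}
+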
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_struct {
+ const dict_index_t* index; /*!< index being sorted */
+ TABLE* table; /*!< MySQL table object */
+ ulint n_dup; /*!< number of duplicates */
+};
+
+/** Structure for reporting duplicate records. */
+typedef struct row_merge_dup_struct row_merge_dup_t;
+
+/*************************************************************//**
+Report a duplicate key. */
+static
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+{
+ mrec_buf_t buf;
+ const dtuple_t* tuple;
+ dtuple_t tuple_store;
+ const rec_t* rec;
+ const dict_index_t* index = dup->index;
+	ulint			n_fields = dict_index_get_n_fields(index);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ ulint n_ext;
+
+ if (dup->n_dup++) {
+ /* Only report the first duplicate record,
+ but count all duplicate records. */
+ return;
+ }
+
+ rec_offs_init(offsets_);
+
+ /* Convert the tuple to a record and then to MySQL format. */
+
+ tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
+ n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
+
+ rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
+ &heap);
+
+ innobase_rec_to_mysql(dup->table, rec, index, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+static
+int
+row_merge_tuple_cmp(
+/*================*/
+ ulint n_field,/*!< in: number of fields */
+ const dfield_t* a, /*!< in: first tuple to be compared */
+ const dfield_t* b, /*!< in: second tuple to be compared */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
+{
+ int cmp;
+ const dfield_t* field = a;
+
+ /* Compare the fields of the tuples until a difference is
+ found or we run out of fields to compare. If !cmp at the
+ end, the tuples are equal. */
+ do {
+ cmp = cmp_dfield_dfield(a++, b++);
+ } while (!cmp && --n_field);
+
+ if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
+ /* Report a duplicate value error if the tuples are
+ logically equal. NULL columns are logically inequal,
+ although they are equal in the sorting order. Find
+ out if any of the fields are NULL. */
+ for (b = field; b != a; b++) {
+ if (dfield_is_null(b)) {
+
+ goto func_exit;
+ }
+ }
+
+ row_merge_dup_report(dup, field);
+ }
+
+func_exit:
+ return(cmp);
+}
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a	array of tuples being sorted
+@param b	aux (work area), same size as tuples[]
+@param c	lower bound of the sorting area, inclusive
+@param d	upper bound of the sorting area, exclusive */
+#define row_merge_tuple_sort_ctx(a,b,c,d) \
+ row_merge_tuple_sort(n_field, dup, a, b, c, d)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a first tuple to be compared
+@param b second tuple to be compared
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory. */
+static
+void
+row_merge_tuple_sort(
+/*=================*/
+ ulint n_field,/*!< in: number of fields */
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t** tuples, /*!< in/out: tuples */
+ const dfield_t** aux, /*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the
+ sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the
+ sorting area, exclusive */
+{
+ UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+ tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort a buffer. */
+static
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
+{
+ row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
+ buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
+}
+
+/******************************************************//**
+Write a buffer to a block. */
+static
+void
+row_merge_buf_write(
+/*================*/
+ const row_merge_buf_t* buf, /*!< in: sorted buffer */
+#ifdef UNIV_DEBUG
+ const merge_file_t* of, /*!< in: output file */
+#endif /* UNIV_DEBUG */
+ row_merge_block_t* block) /*!< out: buffer for writing to file */
+#ifndef UNIV_DEBUG
+# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
+#endif /* !UNIV_DEBUG */
+{
+ const dict_index_t* index = buf->index;
+	ulint			n_fields = dict_index_get_n_fields(index);
+ byte* b = &(*block)[0];
+
+ ulint i;
+
+ for (i = 0; i < buf->n_tuples; i++) {
+ ulint size;
+ ulint extra_size;
+ const dfield_t* entry = buf->tuples[i];
+
+ size = rec_get_converted_size_comp(index,
+ REC_STATUS_ORDINARY,
+ entry, n_fields,
+ &extra_size);
+ ut_ad(size > extra_size);
+ ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
+ extra_size -= REC_N_NEW_EXTRA_BYTES;
+ size -= REC_N_NEW_EXTRA_BYTES;
+
+ /* Encode extra_size + 1 */
+ if (extra_size + 1 < 0x80) {
+ *b++ = (byte) (extra_size + 1);
+ } else {
+ ut_ad((extra_size + 1) < 0x8000);
+ *b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+ *b++ = (byte) (extra_size + 1);
+ }
+
+ ut_ad(b + size < block[1]);
+
+ rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
+ REC_STATUS_ORDINARY,
+ entry, n_fields);
+
+ b += size;
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
+ (void*) b, of->fd, (ulong) of->offset,
+ (ulong) i);
+ row_merge_tuple_print(stderr, entry, n_fields);
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Write an "end-of-chunk" marker. */
+ ut_a(b < block[1]);
+ ut_a(b == block[0] + buf->total_size);
+ *b++ = 0;
+#ifdef UNIV_DEBUG_VALGRIND
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
+ (void*) b, of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+}
+
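+/* A minimal sketch of the variable-length encoding of (extra_size + 1)
+used above, added for illustration and not part of the original source:
+values below 0x80 take one byte; larger values take two bytes,
+big-endian, with the high bit of the first byte set.
+@return number of bytes written (1 or 2) */
+static
+ulint
+row_merge_example_encode_extra(
+/*===========================*/
+	byte*	b,	/*!< out: buffer of at least 2 bytes */
+	ulint	e)	/*!< in: extra_size + 1; nonzero, below 0x8000 */
+{
+	if (e < 0x80) {
+		*b = (byte) e;
+		return(1);
+	}
+
+	b[0] = (byte) (0x80 | (e >> 8));
+	b[1] = (byte) e;
+	return(2);
+}
+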
+/******************************************************//**
+Create a memory heap and allocate space for two record offset arrays.
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint** offsets1, /*!< out: offsets */
+ ulint** offsets2) /*!< out: offsets */
+{
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
+
+ *offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
+ *offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+
+ (*offsets1)[0] = (*offsets2)[0] = i;
+ (*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
+
+ return(heap);
+}
+
+/**********************************************************************//**
+Search an index object by name and column names. If several indexes match,
+return the index with the max id.
+@return matching index, NULL if not found */
+static
+dict_index_t*
+row_merge_dict_table_get_index(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ const merge_index_def_t*index_def) /*!< in: index definition */
+{
+ ulint i;
+ dict_index_t* index;
+ const char** column_names;
+
+ column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
+
+ for (i = 0; i < index_def->n_fields; ++i) {
+ column_names[i] = index_def->fields[i].field_name;
+ }
+
+ index = dict_table_get_index_by_max_id(
+ table, index_def->name, column_names, index_def->n_fields);
+
+ mem_free((void*) column_names);
+
+ return(index);
+}
+
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if the request was successful, FALSE on failure */
+static
+ibool
+row_merge_read(
+/*===========*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read */
+ row_merge_block_t* buf) /*!< out: data */
+{
+ ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
+ ibool success;
+
+ success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+ (ulint) (ofs & 0xFFFFFFFF),
+ (ulint) (ofs >> 32),
+ sizeof *buf);
+ if (UNIV_UNLIKELY(!success)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: failed to read merge block at %llu\n", ofs);
+ }
+
+ return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if the request was successful, FALSE on failure */
+static
+ibool
+row_merge_write(
+/*============*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write */
+ const void* buf) /*!< in: data */
+{
+ ib_uint64_t ofs = ((ib_uint64_t) offset)
+ * sizeof(row_merge_block_t);
+
+ return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+ (ulint) (ofs & 0xFFFFFFFF),
+ (ulint) (ofs >> 32),
+ sizeof(row_merge_block_t))));
+}
+
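+/* A minimal sketch of the file offset arithmetic shared by the two I/O
+wrappers above, added for illustration and not part of the original
+source: a block number maps to the byte offset
+block_no * sizeof(row_merge_block_t), which is passed to the os_file
+layer as separate low/high 32-bit halves. */
+static
+void
+row_merge_example_split_offset(
+/*===========================*/
+	ulint	block_no,	/*!< in: block number in the file */
+	ulint*	lo,		/*!< out: low 32 bits of byte offset */
+	ulint*	hi)		/*!< out: high 32 bits of byte offset */
+{
+	ib_uint64_t	ofs = ((ib_uint64_t) block_no)
+		* sizeof(row_merge_block_t);
+
+	*lo = (ulint) (ofs & 0xFFFFFFFF);
+	*hi = (ulint) (ofs >> 32);
+}
+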
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+static
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ ulint* offsets)/*!< out: offsets of mrec */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= block[0]);
+ ut_ad(b < block[1]);
+ ut_ad(index);
+ ut_ad(foffs);
+ ut_ad(mrec);
+ ut_ad(offsets);
+
+ ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index));
+
+ extra_size = *b++;
+
+ if (UNIV_UNLIKELY(!extra_size)) {
+ /* End of list */
+ *mrec = NULL;
+#ifdef UNIV_DEBUG
+ if (row_merge_print_read) {
+ fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
+ (const void*) b, (const void*) block,
+ fd, (ulong) *foffs);
+ }
+#endif /* UNIV_DEBUG */
+ return(NULL);
+ }
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ if (UNIV_UNLIKELY(b >= block[1])) {
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+err_exit:
+ /* Signal I/O error. */
+ *mrec = b;
+ return(NULL);
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = block[0];
+ }
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *b++;
+ }
+
+ /* Normalize extra_size. Above, value 0 signals "end of list". */
+ extra_size--;
+
+ /* Read the extra bytes. */
+
+ if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
+ /* The record spans two blocks. Copy the entire record
+ to the auxiliary buffer and handle this as a special
+ case. */
+
+ avail_size = block[1] - b;
+
+ memcpy(*buf, b, avail_size);
+
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = block[0];
+
+ /* Copy the record. */
+ memcpy(*buf + avail_size, b, extra_size - avail_size);
+ b += extra_size - avail_size;
+
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+
+ /* These overflows should be impossible given that
+ records are much smaller than either buffer, and
+ the record starts near the beginning of each buffer. */
+ ut_a(extra_size + data_size < sizeof *buf);
+ ut_a(b + data_size < block[1]);
+
+ /* Copy the data bytes. */
+ memcpy(*buf + extra_size, b, data_size);
+ b += data_size;
+
+ goto func_exit;
+ }
+
+ *mrec = b + extra_size;
+
+ rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+ ut_ad(extra_size + data_size < sizeof *buf);
+
+ b += extra_size + data_size;
+
+ if (UNIV_LIKELY(b < block[1])) {
+ /* The record fits entirely in the block.
+ This is the normal case. */
+ goto func_exit;
+ }
+
+ /* The record spans two blocks. Copy it to buf. */
+
+ b -= extra_size + data_size;
+ avail_size = block[1] - b;
+ memcpy(*buf, b, avail_size);
+ *mrec = *buf + extra_size;
+#ifdef UNIV_DEBUG
+ /* We cannot invoke rec_offs_make_valid() here, because there
+ are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
+ Similarly, rec_offs_validate() would fail, because it invokes
+ rec_get_status(). */
+ offsets[2] = (ulint) *mrec;
+ offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = block[0];
+
+ /* Copy the rest of the record. */
+ memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+ b += extra_size + data_size - avail_size;
+
+func_exit:
+#ifdef UNIV_DEBUG
+ if (row_merge_print_read) {
+ fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
+ (const void*) b, (const void*) block,
+ fd, (ulong) *foffs);
+ rec_print_comp(stderr, *mrec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(b);
+}
+
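+/* A minimal sketch of the wrap-around copy performed above when a
+record straddles a block boundary, added for illustration and not part
+of the original source: the head is taken from the end of the old
+block and the tail from the beginning of the freshly read block, so
+that the record becomes contiguous in the auxiliary buffer. */
+static
+void
+row_merge_example_copy_span(
+/*========================*/
+	byte*		dst,	/*!< out: buffer for the whole record */
+	const byte*	b,	/*!< in: start of record in old block */
+	const byte*	old_end,/*!< in: end of the old block */
+	const byte*	new_blk,/*!< in: start of the new block */
+	ulint		size)	/*!< in: total record size in bytes */
+{
+	ulint	avail = (ulint) (old_end - b);
+
+	memcpy(dst, b, avail);			/* head from old block */
+	memcpy(dst + avail, new_blk, size - avail);/* tail from new block */
+}
+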
+/********************************************************************//**
+Write a merge record. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+ byte* b, /*!< out: buffer */
+ ulint e, /*!< in: encoded extra_size */
+#ifdef UNIV_DEBUG
+ ulint size, /*!< in: total size to write */
+ int fd, /*!< in: file descriptor */
+ ulint foffs, /*!< in: file offset */
+#endif /* UNIV_DEBUG */
+ const mrec_t* mrec, /*!< in: record to write */
+ const ulint* offsets)/*!< in: offsets of mrec */
+#ifndef UNIV_DEBUG
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
+ row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* !UNIV_DEBUG */
+{
+#ifdef UNIV_DEBUG
+ const byte* const end = b + size;
+ ut_ad(e == rec_offs_extra_size(offsets) + 1);
+
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_write %p,%d,%lu ",
+ (void*) b, fd, (ulong) foffs);
+ rec_print_comp(stderr, mrec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (e < 0x80) {
+ *b++ = (byte) e;
+ } else {
+ *b++ = (byte) (0x80 | (e >> 8));
+ *b++ = (byte) e;
+ }
+
+ memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+ ut_ad(b + rec_offs_size(offsets) == end);
+}
+
+/********************************************************************//**
+Write a merge record.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ byte* b, /*!< in: pointer to end of block */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t* mrec, /*!< in: record to write */
+ const ulint* offsets)/*!< in: offsets of mrec */
+{
+ ulint extra_size;
+ ulint size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= block[0]);
+ ut_ad(b < block[1]);
+ ut_ad(mrec);
+ ut_ad(foffs);
+ ut_ad(mrec < block[0] || mrec > block[1]);
+ ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+ /* Normalize extra_size. Value 0 signals "end of list". */
+ extra_size = rec_offs_extra_size(offsets) + 1;
+
+ size = extra_size + (extra_size >= 0x80)
+ + rec_offs_data_size(offsets);
+
+ if (UNIV_UNLIKELY(b + size >= block[1])) {
+ /* The record spans two blocks.
+ Copy it to the temporary buffer first. */
+ avail_size = block[1] - b;
+
+ row_merge_write_rec_low(buf[0],
+ extra_size, size, fd, *foffs,
+ mrec, offsets);
+
+ /* Copy the head of the temporary buffer, write
+ the completed block, and copy the tail of the
+ record to the head of the new block. */
+ memcpy(b, buf[0], avail_size);
+
+ if (!row_merge_write(fd, (*foffs)++, block)) {
+ return(NULL);
+ }
+
+ UNIV_MEM_INVALID(block[0], sizeof block[0]);
+
+ /* Copy the rest. */
+ b = block[0];
+ memcpy(b, buf[0] + avail_size, size - avail_size);
+ b += size - avail_size;
+ } else {
+ row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
+ mrec, offsets);
+ b += size;
+ }
+
+ return(b);
+}
+
+/********************************************************************//**
+Write an end-of-list marker.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ byte* b, /*!< in: pointer to end of block */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs) /*!< in/out: file offset */
+{
+ ut_ad(block);
+ ut_ad(b >= block[0]);
+ ut_ad(b < block[1]);
+ ut_ad(foffs);
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
+ (void*) b, (void*) block, fd, (ulong) *foffs);
+ }
+#endif /* UNIV_DEBUG */
+
+ *b++ = 0;
+ UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
+ UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
+#ifdef UNIV_DEBUG_VALGRIND
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ if (!row_merge_write(fd, (*foffs)++, block)) {
+ return(NULL);
+ }
+
+ UNIV_MEM_INVALID(block[0], sizeof block[0]);
+ return(block[0]);
+}
+
+/*************************************************************//**
+Compare two merge records.
+@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
+static
+int
+row_merge_cmp(
+/*==========*/
+ const mrec_t* mrec1, /*!< in: first merge
+ record to be compared */
+ const mrec_t* mrec2, /*!< in: second merge
+ record to be compared */
+ const ulint* offsets1, /*!< in: first record offsets */
+ const ulint* offsets2, /*!< in: second record offsets */
+ const dict_index_t* index) /*!< in: index */
+{
+ int cmp;
+
+ cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_cmp) {
+ fputs("row_merge_cmp1 ", stderr);
+ rec_print_comp(stderr, mrec1, offsets1);
+ fputs("\nrow_merge_cmp2 ", stderr);
+ rec_print_comp(stderr, mrec2, offsets2);
+ fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(cmp);
+}
+
+/********************************************************************//**
+Reads the clustered index of the table and creates temporary files
+containing the index entries for the indexes to be built.
+@return DB_SUCCESS or error */
+static
+ulint
+row_merge_read_clustered_index(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ TABLE* table, /*!< in/out: MySQL table object,
+ for reporting erroneous records */
+ const dict_table_t* old_table,/*!< in: table where rows are
+ read from */
+ const dict_table_t* new_table,/*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ dict_index_t** index, /*!< in: indexes to be created */
+ merge_file_t* files, /*!< in: temporary files */
+ ulint n_index,/*!< in: number of indexes to create */
+ row_merge_block_t* block) /*!< in/out: file buffer */
+{
+ dict_index_t* clust_index; /* Clustered index */
+ mem_heap_t* row_heap; /* Heap memory to create
+ clustered index records */
+ row_merge_buf_t** merge_buf; /* Temporary list for records*/
+ btr_pcur_t pcur; /* Persistent cursor on the
+ clustered index */
+ mtr_t mtr; /* Mini transaction */
+ ulint err = DB_SUCCESS;/* Return code */
+ ulint i;
+ ulint n_nonnull = 0; /* number of columns
+ changed to NOT NULL */
+ ulint* nonnull = NULL; /* NOT NULL columns */
+
+ trx->op_info = "reading clustered index";
+
+ ut_ad(trx);
+ ut_ad(old_table);
+ ut_ad(new_table);
+ ut_ad(index);
+ ut_ad(files);
+
+ /* Create and initialize memory for record buffers */
+
+ merge_buf = mem_alloc(n_index * sizeof *merge_buf);
+
+ for (i = 0; i < n_index; i++) {
+ merge_buf[i] = row_merge_buf_create(index[i]);
+ }
+
+ mtr_start(&mtr);
+
+ /* Find the clustered index and create a persistent cursor
+ based on that. */
+
+ clust_index = dict_table_get_first_index(old_table);
+
+ btr_pcur_open_at_index_side(
+ TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ if (UNIV_UNLIKELY(old_table != new_table)) {
+ ulint n_cols = dict_table_get_n_cols(old_table);
+
+ /* A primary key will be created. Identify the
+ columns that were flagged NOT NULL in the new table,
+ so that we can quickly check that the records in the
+ (old) clustered index do not violate the added NOT
+ NULL constraints. */
+
+ ut_a(n_cols == dict_table_get_n_cols(new_table));
+
+ nonnull = mem_alloc(n_cols * sizeof *nonnull);
+
+ for (i = 0; i < n_cols; i++) {
+ if (dict_table_get_nth_col(old_table, i)->prtype
+ & DATA_NOT_NULL) {
+
+ continue;
+ }
+
+ if (dict_table_get_nth_col(new_table, i)->prtype
+ & DATA_NOT_NULL) {
+
+ nonnull[n_nonnull++] = i;
+ }
+ }
+
+ if (!n_nonnull) {
+ mem_free(nonnull);
+ nonnull = NULL;
+ }
+ }
+
+ row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+ /* Scan the clustered index. */
+ for (;;) {
+ const rec_t* rec;
+ ulint* offsets;
+ dtuple_t* row = NULL;
+ row_ext_t* ext;
+ ibool has_next = TRUE;
+
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ /* When switching pages, commit the mini-transaction
+ in order to release the latch on the old page. */
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ if (UNIV_LIKELY(has_next)) {
+ rec = btr_pcur_get_rec(&pcur);
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &row_heap);
+
+ /* Skip delete marked records. */
+ if (rec_get_deleted_flag(
+ rec, dict_table_is_comp(old_table))) {
+ continue;
+ }
+
+ srv_n_rows_inserted++;
+
+ /* Build a row based on the clustered index. */
+
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, offsets,
+ new_table, &ext, row_heap);
+
+ if (UNIV_LIKELY_NULL(nonnull)) {
+ for (i = 0; i < n_nonnull; i++) {
+ dfield_t* field
+ = &row->fields[nonnull[i]];
+ dtype_t* field_type
+ = dfield_get_type(field);
+
+ ut_a(!(field_type->prtype
+ & DATA_NOT_NULL));
+
+ if (dfield_is_null(field)) {
+ err = DB_PRIMARY_KEY_IS_NULL;
+ i = 0;
+ goto err_exit;
+ }
+
+ field_type->prtype |= DATA_NOT_NULL;
+ }
+ }
+ }
+
+ /* Build all entries for all the indexes to be created
+ in a single scan of the clustered index. */
+
+ for (i = 0; i < n_index; i++) {
+ row_merge_buf_t* buf = merge_buf[i];
+ merge_file_t* file = &files[i];
+ const dict_index_t* index = buf->index;
+
+ if (UNIV_LIKELY
+ (row && row_merge_buf_add(buf, row, ext))) {
+ continue;
+ }
+
+ /* The buffer must be sufficiently large
+ to hold at least one record. */
+ ut_ad(buf->n_tuples || !has_next);
+
+ /* We have enough data tuples to form a block.
+ Sort them and write to disk. */
+
+ if (buf->n_tuples) {
+ if (dict_index_is_unique(index)) {
+ row_merge_dup_t dup;
+ dup.index = buf->index;
+ dup.table = table;
+ dup.n_dup = 0;
+
+ row_merge_buf_sort(buf, &dup);
+
+ if (dup.n_dup) {
+ err = DB_DUPLICATE_KEY;
+err_exit:
+ trx->error_key_num = i;
+ goto func_exit;
+ }
+ } else {
+ row_merge_buf_sort(buf, NULL);
+ }
+ }
+
+ row_merge_buf_write(buf, file, block);
+
+ if (!row_merge_write(file->fd, file->offset++,
+ block)) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto err_exit;
+ }
+
+ UNIV_MEM_INVALID(block[0], sizeof block[0]);
+ merge_buf[i] = row_merge_buf_empty(buf);
+
+ /* Try writing the record again, now that
+ the buffer has been written out and emptied. */
+
+ if (UNIV_UNLIKELY
+ (row && !row_merge_buf_add(buf, row, ext))) {
+ /* An empty buffer should have enough
+ room for at least one record. */
+ ut_error;
+ }
+ }
+
+ mem_heap_empty(row_heap);
+
+ if (UNIV_UNLIKELY(!has_next)) {
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(row_heap);
+
+ if (UNIV_LIKELY_NULL(nonnull)) {
+ mem_free(nonnull);
+ }
+
+ for (i = 0; i < n_index; i++) {
+ row_merge_buf_free(merge_buf[i]);
+ }
+
+ mem_free(merge_buf);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+@param N number of the buffer (0 or 1)
+@param AT_END statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \
+ do { \
+ b2 = row_merge_write_rec(&block[2], &buf[2], b2, \
+ of->fd, &of->offset, \
+ mrec##N, offsets##N); \
+ if (UNIV_UNLIKELY(!b2)) { \
+ goto corrupt; \
+ } \
+ b##N = row_merge_read_rec(&block[N], &buf[N], \
+ b##N, index, \
+ file->fd, foffs##N, \
+ &mrec##N, offsets##N); \
+ if (UNIV_UNLIKELY(!b##N)) { \
+ if (mrec##N) { \
+ goto corrupt; \
+ } \
+ AT_END; \
+ } \
+ } while (0)
+
+/*************************************************************//**
+Merge two blocks of linked lists on disk and write a bigger block.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge_blocks(
+/*=============*/
+ const dict_index_t* index, /*!< in: index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ ulint* foffs0, /*!< in/out: offset of first
+ source list in the file */
+ ulint* foffs1, /*!< in/out: offset of second
+ source list in the file */
+ merge_file_t* of, /*!< in/out: output file */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t buf[3]; /*!< buffer for handling split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ const byte* b1; /*!< pointer to block[1] */
+ byte* b2; /*!< pointer to block[2] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
+ const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */
+	ulint*		offsets0;/*!< offsets of mrec0 */
+	ulint*		offsets1;/*!< offsets of mrec1 */
+
+ heap = row_merge_heap_create(index, &offsets0, &offsets1);
+
+ /* Write a record and read the next record. Split the output
+ file in two halves, which can be merged on the following pass. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0])
+ || !row_merge_read(file->fd, *foffs1, &block[1])) {
+corrupt:
+ mem_heap_free(heap);
+ return(DB_CORRUPTION);
+ }
+
+ b0 = block[0];
+ b1 = block[1];
+ b2 = block[2];
+
+ b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
+ foffs0, &mrec0, offsets0);
+ b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
+ foffs1, &mrec1, offsets1);
+ if (UNIV_UNLIKELY(!b0 && mrec0)
+ || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+ goto corrupt;
+ }
+
+ while (mrec0 && mrec1) {
+ switch (row_merge_cmp(mrec0, mrec1,
+ offsets0, offsets1, index)) {
+ case 0:
+ if (UNIV_UNLIKELY
+ (dict_index_is_unique(index))) {
+ innobase_rec_to_mysql(table, mrec0,
+ index, offsets0);
+ mem_heap_free(heap);
+ return(DB_DUPLICATE_KEY);
+ }
+ /* fall through */
+ case -1:
+ ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
+ break;
+ case 1:
+ ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
+ break;
+ default:
+ ut_error;
+ }
+
+ }
+
+merged:
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
+ }
+ }
+done0:
+ if (mrec1) {
+ /* append all mrec1 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
+ }
+ }
+done1:
+
+ mem_heap_free(heap);
+ b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
+ return(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
+
+/*************************************************************//**
+Perform a single merge pass: merge runs from the two halves of the file.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge(
+/*======*/
+ const dict_index_t* index, /*!< in: index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ ulint half, /*!< in: half the file */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd, /*!< in/out: temporary file handle */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ ulint foffs0; /*!< first input offset */
+ ulint foffs1; /*!< second input offset */
+ ulint error; /*!< error code */
+ merge_file_t of; /*!< output file */
+
+ UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
+ ut_ad(half > 0);
+
+ of.fd = *tmpfd;
+ of.offset = 0;
+
+ /* Merge blocks to the output file. */
+ foffs0 = 0;
+ foffs1 = half;
+
+ for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
+ error = row_merge_blocks(index, file, block,
+ &foffs0, &foffs1, &of, table);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+ }
+
+ /* Copy the last block, if there is one. */
+ while (foffs0 < half) {
+ if (!row_merge_read(file->fd, foffs0++, block)
+ || !row_merge_write(of.fd, of.offset++, block)) {
+ return(DB_CORRUPTION);
+ }
+ }
+ while (foffs1 < file->offset) {
+ if (!row_merge_read(file->fd, foffs1++, block)
+ || !row_merge_write(of.fd, of.offset++, block)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ /* Swap file descriptors for the next pass. */
+ *tmpfd = file->fd;
+ *file = of;
+
+ UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Merge a file of sorted runs by repeated passes until it is fully sorted.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge_sort(
+/*===========*/
+ const dict_index_t* index, /*!< in: index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd, /*!< in/out: temporary file handle */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ ulint blksz; /*!< block size */
+
+ for (blksz = 1; blksz < file->offset; blksz *= 2) {
+ ulint half;
+ ulint error;
+
+ ut_ad(ut_is_2pow(blksz));
+ half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
+ error = row_merge(index, file, half, block, tmpfd, table);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
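+/* A minimal sketch of the pass count implied by the loop above, added
+for illustration and not part of the original source: each call to
+row_merge() doubles the length of the sorted runs, so a file of
+n_blocks blocks is fully sorted after ceil(log2(n_blocks)) passes. */
+static
+ulint
+row_merge_example_n_passes(
+/*=======================*/
+	ulint	n_blocks)	/*!< in: number of blocks in the file */
+{
+	ulint	n_passes = 0;
+	ulint	blksz;
+
+	for (blksz = 1; blksz < n_blocks; blksz *= 2) {
+		n_passes++;
+	}
+
+	return(n_passes);
+}
+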
+/*************************************************************//**
+Copy externally stored columns to the data tuple. */
+static
+void
+row_merge_copy_blobs(
+/*=================*/
+ const mrec_t* mrec, /*!< in: merge record */
+ const ulint* offsets,/*!< in: offsets of mrec */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ mem_heap_t* heap) /*!< in/out: memory heap */
+{
+ ulint i;
+ ulint n_fields = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n_fields; i++) {
+ ulint len;
+ const void* data;
+ dfield_t* field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_is_ext(field)) {
+ continue;
+ }
+
+ ut_ad(!dfield_is_null(field));
+
+ /* The table is locked during index creation.
+ Therefore, externally stored columns cannot possibly
+ be freed between the time the BLOB pointers are read
+ (row_merge_read_clustered_index()) and dereferenced
+ (below). */
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets, zip_size, i, &len, heap);
+
+ dfield_set_data(field, data, len);
+ }
+}
+
+/********************************************************************//**
+Read a sorted file containing index data tuples and insert these data
+tuples into the index.
+@return DB_SUCCESS or error number */
+static
+ulint
+row_merge_insert_index_tuples(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ ulint zip_size,/*!< in: compressed page size of
+ the old table, or 0 if uncompressed */
+ int fd, /*!< in: file descriptor */
+ row_merge_block_t* block) /*!< in/out: file buffer */
+{
+ mrec_buf_t buf;
+ const byte* b;
+ que_thr_t* thr;
+ ins_node_t* node;
+ mem_heap_t* tuple_heap;
+ mem_heap_t* graph_heap;
+ ulint error = DB_SUCCESS;
+ ulint foffs = 0;
+ ulint* offsets;
+
+ ut_ad(trx);
+ ut_ad(index);
+ ut_ad(table);
+
+ /* We use the insert query graph as the dummy graph
+ needed in the row module call */
+
+ trx->op_info = "inserting index entries";
+
+ graph_heap = mem_heap_create(500);
+ node = ins_node_create(INS_DIRECT, table, graph_heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, graph_heap);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ tuple_heap = mem_heap_create(1000);
+
+ {
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
+ offsets[0] = i;
+ offsets[1] = dict_index_get_n_fields(index);
+ }
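+	/* Note (added commentary): offsets[] follows the rec_offs
+	convention of rem0rec -- offsets[0] holds the allocated size of
+	the array and offsets[1] the number of fields; the per-column
+	offsets are filled in by row_merge_read_rec() below. */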
+
+ b = *block;
+
+ if (!row_merge_read(fd, foffs, block)) {
+ error = DB_CORRUPTION;
+ } else {
+ for (;;) {
+ const mrec_t* mrec;
+ dtuple_t* dtuple;
+ ulint n_ext;
+
+ b = row_merge_read_rec(block, &buf, b, index,
+ fd, &foffs, &mrec, offsets);
+ if (UNIV_UNLIKELY(!b)) {
+ /* End of list, or I/O error */
+ if (mrec) {
+ error = DB_CORRUPTION;
+ }
+ break;
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec, index, offsets, &n_ext, tuple_heap);
+
+ if (UNIV_UNLIKELY(n_ext)) {
+ row_merge_copy_blobs(mrec, offsets, zip_size,
+ dtuple, tuple_heap);
+ }
+
+ node->row = dtuple;
+ node->table = table;
+ node->trx_id = trx->id;
+
+ ut_ad(dtuple_validate(dtuple));
+
+ do {
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ error = row_ins_index_entry(index, dtuple,
+ 0, FALSE, thr);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+
+ goto next_rec;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ trx->error_state = error;
+ que_thr_stop_for_mysql(thr);
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ } while (row_mysql_handle_errors(&error, trx,
+ thr, NULL));
+
+ goto err_exit;
+next_rec:
+ mem_heap_empty(tuple_heap);
+ }
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+err_exit:
+ que_graph_free(thr->graph);
+
+ trx->op_info = "";
+
+ mem_heap_free(tuple_heap);
+
+ return(error);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+ sel_node_t* node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ heap = mem_heap_create(512);
+
+ trx->op_info = "setting table lock for creating or dropping index";
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(que_node_get_parent(thr));
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ que_thr_stop_for_mysql_no_error(thr, trx);
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (err != DB_QUE_THR_SUSPENDED) {
+ ibool was_lock_wait;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+ } else {
+ que_thr_t* run_thr;
+ que_node_t* parent;
+
+ parent = que_node_get_parent(thr);
+ run_thr = que_fork_start_command(parent);
+
+ ut_a(run_thr == thr);
+
+ /* There was a lock wait but the thread was not
+ in a ready to run or running state. */
+ trx->error_state = DB_LOCK_WAIT;
+
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drop an index from the InnoDB system tables. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_index(
+/*=================*/
+ dict_index_t* index, /*!< in: index to be removed */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ulint err;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ static const char str1[] =
+ "PROCEDURE DROP_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
+ " AND TABLE_ID = :tableid;\n"
+ "END;\n";
+
+ ut_ad(index && table && trx);
+
+ pars_info_add_dulint_literal(info, "indexid", index->id);
+ pars_info_add_dulint_literal(info, "tableid", table->id);
+
+ trx_start_if_not_started(trx);
+ trx->op_info = "dropping index";
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ err = que_eval_sql(info, str1, FALSE, trx);
+
+ ut_a(err == DB_SUCCESS);
+
+ /* Replace this index with another equivalent index for all
+ foreign key constraints on this table where this index is used */
+
+ dict_table_replace_index_in_foreign_list(table, index);
+ dict_index_remove_from_cache(table, index);
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred when
+building an index. The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table containing the indexes */
+ dict_index_t** index, /*!< in: indexes to drop */
+ ulint num_created) /*!< in: number of elements in index[] */
+{
+ ulint key_num;
+
+ for (key_num = 0; key_num < num_created; key_num++) {
+ row_merge_drop_index(index[key_num], table, trx);
+ }
+}
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+ trx_t* trx;
+ ulint err;
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+ static const char drop_temp_indexes[] =
+ "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
+ "indexid CHAR;\n"
+ "DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
+ "WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
+ "BEGIN\n"
+ "\tOPEN c;\n"
+ "\tWHILE 1=1 LOOP\n"
+ "\t\tFETCH c INTO indexid;\n"
+ "\t\tIF (SQL % NOTFOUND) THEN\n"
+ "\t\t\tEXIT;\n"
+ "\t\tEND IF;\n"
+ "\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
+ "\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
+ "\tEND LOOP;\n"
+ "\tCLOSE c;\n"
+ "\tCOMMIT WORK;\n"
+ "END;\n";
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping partially created indexes";
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Incomplete transactions may be holding some locks on the
+ data dictionary tables. However, they should never have been
+ able to lock the records corresponding to the partially
+ created indexes that we are attempting to delete, because the
+ table was locked when the indexes were being created. We will
+ drop the partially created indexes before the rollback of
+ incomplete transactions is initiated. Thus, this should not
+ interfere with the incomplete transactions. */
+ trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+ err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
+ ut_a(err == DB_SUCCESS);
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
+/*********************************************************************//**
+Create a merge file. */
+static
+void
+row_merge_file_create(
+/*==================*/
+ merge_file_t* merge_file) /*!< out: merge file structure */
+{
+ merge_file->fd = innobase_mysql_tmpfile();
+ merge_file->offset = 0;
+}
+
+/*********************************************************************//**
+Destroy a merge file. */
+static
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< out: merge file structure */
+{
+ if (merge_file->fd != -1) {
+ close(merge_file->fd);
+ merge_file->fd = -1;
+ }
+}
+
+/*********************************************************************//**
+Determine the precise type of a column that is added to a temporary
+table: check whether the column must be constrained NOT NULL.
+@return col->prtype, possibly ORed with DATA_NOT_NULL */
+UNIV_INLINE
+ulint
+row_merge_col_prtype(
+/*=================*/
+ const dict_col_t* col, /*!< in: column */
+ const char* col_name, /*!< in: name of the column */
+ const merge_index_def_t*index_def) /*!< in: the index definition
+ of the primary key */
+{
+ ulint prtype = col->prtype;
+ ulint i;
+
+ ut_ad(index_def->ind_type & DICT_CLUSTERED);
+
+ if (prtype & DATA_NOT_NULL) {
+
+ return(prtype);
+ }
+
+ /* All columns that are included
+ in the PRIMARY KEY must be NOT NULL. */
+
+ for (i = 0; i < index_def->n_fields; i++) {
+ if (!strcmp(col_name, index_def->fields[i].field_name)) {
+ return(prtype | DATA_NOT_NULL);
+ }
+ }
+
+ return(prtype);
+}
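+
+/* Illustration (not part of the original source): with a new PRIMARY
+KEY defined on (a, b), row_merge_col_prtype() returns
+
+	col->prtype | DATA_NOT_NULL	for columns "a" and "b"
+	col->prtype			for every other column
+
+so that row_merge_create_temporary_table() below creates all PRIMARY
+KEY columns as NOT NULL. */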
+
+/*********************************************************************//**
+Create a temporary table for creating a primary key, using the definition
+of an existing table.
+@return table, or NULL on error */
+UNIV_INTERN
+dict_table_t*
+row_merge_create_temporary_table(
+/*=============================*/
+ const char* table_name, /*!< in: new table name */
+ const merge_index_def_t*index_def, /*!< in: the index definition
+ of the primary key */
+ const dict_table_t* table, /*!< in: old table definition */
+ trx_t* trx) /*!< in/out: transaction
+ (sets error_state) */
+{
+ ulint i;
+ dict_table_t* new_table = NULL;
+ ulint n_cols = dict_table_get_n_user_cols(table);
+ ulint error;
+ mem_heap_t* heap = mem_heap_create(1000);
+
+ ut_ad(table_name);
+ ut_ad(index_def);
+ ut_ad(table);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
+
+ for (i = 0; i < n_cols; i++) {
+ const dict_col_t* col;
+ const char* col_name;
+
+ col = dict_table_get_nth_col(table, i);
+ col_name = dict_table_get_col_name(table, i);
+
+ dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
+ row_merge_col_prtype(col, col_name,
+ index_def),
+ col->len);
+ }
+
+ error = row_create_table_for_mysql(new_table, trx);
+ mem_heap_free(heap);
+
+ if (error != DB_SUCCESS) {
+ trx->error_state = error;
+ new_table = NULL;
+ }
+
+ return(new_table);
+}
+
+/*********************************************************************//**
+Rename the temporary indexes in the dictionary to permanent ones. The
+data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+row_merge_rename_indexes(
+/*=====================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table) /*!< in/out: table with new indexes */
+{
+ ulint err = DB_SUCCESS;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_indexes[] =
+ "PROCEDURE RENAME_INDEXES_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+ "WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
+ TEMP_INDEX_PREFIX_STR "';\n"
+ "END;\n";
+
+ ut_ad(table);
+ ut_ad(trx);
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ trx->op_info = "renaming indexes";
+
+ pars_info_add_dulint_literal(info, "tableid", table->id);
+
+ err = que_eval_sql(info, rename_indexes, FALSE, trx);
+
+ if (err == DB_SUCCESS) {
+ dict_index_t* index = dict_table_get_first_index(table);
+ do {
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ index->name++;
+ }
+ index = dict_table_get_next_index(index);
+ } while (index);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
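+
+/* Note (added commentary): temporary indexes carry a one-character
+TEMP_INDEX_PREFIX in front of their name. The SQL above strips that
+first character in SYS_INDEXES, and the dictionary cache is kept in
+sync by simply advancing the cached name pointer past the prefix byte
+(index->name++). */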
+
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_rename_tables(
+/*====================*/
+ dict_table_t* old_table, /*!< in/out: old table, renamed to
+ tmp_name */
+ dict_table_t* new_table, /*!< in/out: new table, renamed to
+ old_table->name */
+ const char* tmp_name, /*!< in: new name for old_table */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ulint err = DB_ERROR;
+ pars_info_t* info;
+ const char* old_name= old_table->name;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(old_table != new_table);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ trx->op_info = "renaming tables";
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data in system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_name", new_table->name);
+ pars_info_add_str_literal(info, "old_name", old_name);
+ pars_info_add_str_literal(info, "tmp_name", tmp_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLES () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
+ " WHERE NAME = :old_name;\n"
+ "UPDATE SYS_TABLES SET NAME = :old_name\n"
+ " WHERE NAME = :new_name;\n"
+ "END;\n", FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ /* The following calls will also rename the .ibd data files if
+ the tables are stored in a single-table tablespace */
+
+ if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
+ || !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
+
+ err = DB_ERROR;
+ goto err_exit;
+ }
+
+ err = dict_load_foreigns(old_name, TRUE);
+
+ if (err != DB_SUCCESS) {
+err_exit:
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create and execute a query graph for creating an index.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge_create_index_graph(
+/*=========================*/
+ trx_t* trx, /*!< in: trx */
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: index */
+{
+ ind_node_t* node; /*!< Index creation node */
+ mem_heap_t* heap; /*!< Memory heap */
+ que_thr_t* thr; /*!< Query thread */
+ ulint err;
+
+ ut_ad(trx);
+ ut_ad(table);
+ ut_ad(index);
+
+ heap = mem_heap_create(512);
+
+ index->table = table;
+ node = ind_create_graph_create(index, heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create the index and load in to the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+ trx_t* trx, /*!< in/out: trx (sets error_state) */
+ dict_table_t* table, /*!< in: the index is on this table */
+ const merge_index_def_t*index_def)
+ /*!< in: the index definition */
+{
+ dict_index_t* index;
+ ulint err;
+ ulint n_fields = index_def->n_fields;
+ ulint i;
+
+	/* Create the index prototype, using the passed-in definition;
+	this is not a persistent operation. We pass 0 as the space id, and
+	determine at a lower level the space id where to store the table. */
+
+ index = dict_mem_index_create(table->name, index_def->name,
+ 0, index_def->ind_type, n_fields);
+
+ ut_a(index);
+
+ for (i = 0; i < n_fields; i++) {
+ merge_index_field_t* ifield = &index_def->fields[i];
+
+ dict_mem_index_add_field(index, ifield->field_name,
+ ifield->prefix_len);
+ }
+
+ /* Add the index to SYS_INDEXES, using the index prototype. */
+ err = row_merge_create_index_graph(trx, table, index);
+
+ if (err == DB_SUCCESS) {
+
+ index = row_merge_dict_table_get_index(
+ table, index_def);
+
+ ut_a(index);
+
+		/* Note the id of the transaction that created this
+		index; we use it to restrict readers from accessing
+		this index, to ensure read consistency. */
+ index->trx_id = (ib_uint64_t)
+ ut_conv_dulint_to_longlong(trx->id);
+ } else {
+ index = NULL;
+ }
+
+ return(index);
+}
+
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return TRUE if the index can be used by the transaction */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+{
+ return(!trx->read_view || read_view_sees_trx_id(
+ trx->read_view,
+ ut_dulint_create((ulint) (index->trx_id >> 32),
+ (ulint) index->trx_id & 0xFFFFFFFF)));
+}
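+
+/* Sketch (added commentary) of the conversion above: index->trx_id is
+stored as an ib_uint64_t, while the read view API expects a dulint
+(a pair of 32-bit words), so the id is split as
+
+	high = (ulint) (index->trx_id >> 32);
+	low  = (ulint) (index->trx_id & 0xFFFFFFFF);
+
+A transaction without a read view (for example, at the READ
+UNCOMMITTED isolation level) may always use the index. */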
+
+/*********************************************************************//**
+Drop the old table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table to drop */
+{
+ /* There must be no open transactions on the table. */
+ ut_a(table->n_mysql_handles_opened == 0);
+
+ return(row_drop_table_for_mysql(table->name, trx, FALSE));
+}
+
+/*********************************************************************//**
+Build indexes on a table by reading its clustered index,
+creating temporary files containing the index entries, merge sorting
+these entries, and inserting the sorted entries into the indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_build_indexes(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* old_table, /*!< in: table where rows are
+ read from */
+ dict_table_t* new_table, /*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ dict_index_t** indexes, /*!< in: indexes to be created */
+ ulint n_indexes, /*!< in: size of indexes[] */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ merge_file_t* merge_files;
+ row_merge_block_t* block;
+ ulint block_size;
+ ulint i;
+ ulint error;
+ int tmpfd;
+
+ ut_ad(trx);
+ ut_ad(old_table);
+ ut_ad(new_table);
+ ut_ad(indexes);
+ ut_ad(n_indexes);
+
+ trx_start_if_not_started(trx);
+
+ /* Allocate memory for merge file data structure and initialize
+ fields */
+
+ merge_files = mem_alloc(n_indexes * sizeof *merge_files);
+ block_size = 3 * sizeof *block;
+ block = os_mem_alloc_large(&block_size);
+
+ for (i = 0; i < n_indexes; i++) {
+
+ row_merge_file_create(&merge_files[i]);
+ }
+
+ tmpfd = innobase_mysql_tmpfile();
+
+ /* Reset the MySQL row buffer that is used when reporting
+ duplicate keys. */
+ innobase_rec_reset(table);
+
+ /* Read clustered index of the table and create files for
+ secondary index entries for merge sort */
+
+ error = row_merge_read_clustered_index(
+ trx, table, old_table, new_table, indexes,
+ merge_files, n_indexes, block);
+
+ if (error != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Now we have files containing index entries ready for
+ sorting and inserting. */
+
+ for (i = 0; i < n_indexes; i++) {
+ error = row_merge_sort(indexes[i], &merge_files[i],
+ block, &tmpfd, table);
+
+ if (error == DB_SUCCESS) {
+ error = row_merge_insert_index_tuples(
+ trx, indexes[i], new_table,
+ dict_table_zip_size(old_table),
+ merge_files[i].fd, block);
+ }
+
+ /* Close the temporary file to free up space. */
+ row_merge_file_destroy(&merge_files[i]);
+
+ if (error != DB_SUCCESS) {
+ trx->error_key_num = i;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ close(tmpfd);
+
+ for (i = 0; i < n_indexes; i++) {
+ row_merge_file_destroy(&merge_files[i]);
+ }
+
+ mem_free(merge_files);
+ os_mem_free_large(block, block_size);
+
+ return(error);
+}
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
new file mode 100644
index 00000000000..b345bb59624
--- /dev/null
+++ b/storage/innobase/row/row0mysql.c
@@ -0,0 +1,4241 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0mysql.c
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "row0ins.h"
+#include "row0merge.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "log0log.h"
+#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
+
+/** Provide optional 4.x backwards compatibility for 5.0 and above */
+UNIV_INTERN ibool row_rollback_on_timeout = FALSE;
+
+/** Chain node of the list of tables to drop in the background. */
+typedef struct row_mysql_drop_struct row_mysql_drop_t;
+
+/** Chain node of the list of tables to drop in the background. */
+struct row_mysql_drop_struct{
+ char* table_name; /*!< table name */
+ UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
+ /*!< list chain node */
+};
+
+/** @brief List of tables we should drop in background.
+
+ALTER TABLE in MySQL requires that the table handler can drop the
+table in the background when there are no longer any queries on it.
+Protected by kernel_mutex. */
+static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
+/** Flag: has row_mysql_drop_list been initialized? */
+static ibool row_mysql_drop_list_inited = FALSE;
+
+/** Magic table names for invoking various monitor threads */
+/* @{ */
+static const char S_innodb_monitor[] = "innodb_monitor";
+static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
+static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
+static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+/* @} */
+
+/** Evaluates to true if str1 equals str2_onstack, used for comparing
+the magic table names.
+@param str1 in: string to compare
+@param str1_len in: length of str1, in bytes, including terminating NUL
+@param str2_onstack in: char[] array containing a NUL terminated string
+@return TRUE if str1 equals str2_onstack */
+#define STR_EQ(str1, str1_len, str2_onstack) \
+ ((str1_len) == sizeof(str2_onstack) \
+ && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0)
+
+/*******************************************************************//**
+Determine if the given name is a name reserved for MySQL system tables.
+@return TRUE if name is a MySQL system table name */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+ const char* name)
+{
+ if (strncmp(name, "mysql/", 6) != 0) {
+
+ return(FALSE);
+ }
+
+ return(0 == strcmp(name + 6, "host")
+ || 0 == strcmp(name + 6, "user")
+ || 0 == strcmp(name + 6, "db"));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in the background. We need this on Unix because
+in ALTER TABLE MySQL may call drop table even while the table has running
+queries on it. Also, if there are running foreign key checks on the table, we
+drop the table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ const char* name); /*!< in: table name */
+
+/*******************************************************************//**
+Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+ if (srv_dml_needed_delay) {
+ os_thread_sleep(srv_dml_needed_delay);
+ }
+}
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+{
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+}
+
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ ut_a(len < 256 * 256);
+
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+ }
+
+ ut_a(lenlen == 1);
+ ut_a(len < 256);
+
+ mach_write_to_1(dest, len);
+
+ return(dest + 1);
+}
+
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen) /*!< in: storage length of len: either 1
+ or 2 bytes */
+{
+ if (lenlen == 2) {
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+ }
+
+ ut_a(lenlen == 1);
+
+ *len = mach_read_from_1(field);
+
+ return(field + 1);
+}
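+
+/* Worked example (not part of the original source): for len == 300 and
+lenlen == 2, row_mysql_store_true_var_len() writes the little-endian
+bytes 0x2C 0x01 (300 == 0x012C) and returns dest + 2;
+row_mysql_read_true_varchar() reads the same two bytes back into *len
+and returns field + 2. */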
+
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len) /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+{
+	/* MySQL might assume the field is zero-filled except for the
+	length and the pointer fields */
+
+ memset(dest, '\0', col_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_a(col_len - 8 > 1 || len < 256);
+ ut_a(col_len - 8 > 2 || len < 256 * 256);
+ ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ memcpy(dest + col_len - 8, &data, sizeof data);
+}
+
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len) /*!< in: BLOB reference length
+ (not BLOB length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
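+
+/* Layout sketch (added commentary): for col_len == 12, the MySQL BLOB
+reference written and read above consists of
+
+	ref[0..3]	BLOB length, little-endian (col_len - 8 bytes)
+	ref[4..11]	the C data pointer, copied with memcpy
+
+On 32-bit architectures only the first 4 bytes of the pointer slot are
+significant; the rest remain zero from the memset. */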
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c.
+@return up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp) /*!< in: nonzero=compact format */
+{
+ const byte* ptr = mysql_data;
+ const dtype_t* dtype;
+ ulint type;
+ ulint lenlen;
+
+ dtype = dfield_get_type(dfield);
+
+ type = dtype->mtype;
+
+ if (type == DATA_INT) {
+		/* Store integer data in Innobase in a big-endian format,
+		with the sign bit flipped if the data is a signed integer.
+		In MySQL, integers are stored in a little-endian format. */
+
+ byte* p = buf + col_len;
+
+ for (;;) {
+ p--;
+ *p = *mysql_data;
+ if (p == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!(dtype->prtype & DATA_UNSIGNED)) {
+
+ *buf ^= 128;
+ }
+
+ ptr = buf;
+ buf += col_len;
+ } else if ((type == DATA_VARCHAR
+ || type == DATA_VARMYSQL
+ || type == DATA_BINARY)) {
+
+ if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+ /* The length of the actual data is stored to 1 or 2
+ bytes at the start of the field */
+
+ if (row_format_col) {
+ if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+ lenlen = 2;
+ } else {
+ lenlen = 1;
+ }
+ } else {
+ /* In a MySQL key value, lenlen is always 2 */
+ lenlen = 2;
+ }
+
+ ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+ lenlen);
+ } else {
+ /* Remove trailing spaces from old style VARCHAR
+ columns. */
+
+ /* Handle UCS2 strings differently. */
+ ulint mbminlen = dtype_get_mbminlen(dtype);
+
+ ptr = mysql_data;
+
+ if (mbminlen == 2) {
+ /* space=0x0020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~1;
+
+ while (col_len >= 2 && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 2;
+ }
+ } else {
+ ut_a(mbminlen == 1);
+ /* space=0x20 */
+ while (col_len > 0
+ && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ }
+ }
+ } else if (comp && type == DATA_MYSQL
+ && dtype_get_mbminlen(dtype) == 1
+ && dtype_get_mbmaxlen(dtype) > 1) {
+ /* In some cases we strip trailing spaces from UTF-8 and other
+ multibyte charsets, from FIXED-length CHAR columns, to save
+ space. UTF-8 would otherwise normally use 3 * the string length
+ bytes to store an ASCII string! */
+
+ /* We assume that this CHAR field is encoded in a
+ variable-length character set where spaces have
+ 1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+ Consider a CHAR(n) field, a field of n characters.
+ It will contain between n * mbminlen and n * mbmaxlen bytes.
+ We will try to truncate it to n bytes by stripping
+ space padding. If the field contains single-byte
+ characters only, it will be truncated to n characters.
+ Consider a CHAR(5) field containing the string ".a "
+ where "." denotes a 3-byte character represented by
+ the bytes "$%&". After our stripping, the string will
+ be stored as "$%&a " (5 bytes). The string ".abc "
+ will be stored as "$%&abc" (6 bytes).
+
+ The space padding will be restored in row0sel.c, function
+ row_sel_field_store_in_mysql_format(). */
+
+ ulint n_chars;
+
+ ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ } else if (type == DATA_BLOB && row_format_col) {
+
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+
+ return(buf);
+}
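+
+/* Worked example (not part of the original source) of the DATA_INT
+conversion above: a signed 4-byte integer is byte-reversed into
+big-endian order and its sign bit is flipped, so that plain memcmp()
+yields the correct numeric ordering:
+
+	 5 : MySQL 05 00 00 00 -> reversed 00 00 00 05 -> stored 80 00 00 05
+	-1 : MySQL FF FF FF FF -> reversed FF FF FF FF -> stored 7F FF FF FF
+
+Any negative value therefore sorts below any non-negative one. */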
+
+/**************************************************************//**
+Convert a row in the MySQL format to a row in the Innobase format. Note that
+the function to convert a MySQL format key value to an InnoDB dtuple is
+row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /*!< in/out: Innobase row where the
+ field type information is already
+ copied there! */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ byte* mysql_rec) /*!< in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+{
+ mysql_row_templ_t* templ;
+ dfield_t* dfield;
+ ulint i;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+ dfield = dtuple_get_nth_field(row, i);
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset]
+ & (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_null(dfield);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield,
+ prebuilt->ins_upd_rec_buff + templ->mysql_col_offset,
+ TRUE, /* MySQL row format data */
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ dict_table_is_comp(prebuilt->table));
+next_column:
+ ;
+ }
+}
+
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running the
+query thread */
+UNIV_INTERN
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ ulint* new_err,/*!< out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_savept_t* savept) /*!< in: savepoint or NULL */
+{
+ ulint err;
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ switch (err) {
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (row_rollback_on_timeout) {
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ break;
+ }
+ /* fall through */
+ case DB_DUPLICATE_KEY:
+ case DB_FOREIGN_DUPLICATE_KEY:
+ case DB_TOO_BIG_RECORD:
+ case DB_ROW_IS_REFERENCED:
+ case DB_NO_REFERENCED_ROW:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ case DB_OUT_OF_FILE_SPACE:
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+ break;
+ case DB_LOCK_WAIT:
+ srv_suspend_mysql_thread(thr);
+
+ if (trx->error_state != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(TRUE);
+
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ break;
+
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ fputs("InnoDB: The database cannot continue"
+ " operation because of\n"
+ "InnoDB: lack of space. You must add"
+ " a new data file to\n"
+ "InnoDB: my.cnf and restart the database.\n", stderr);
+
+ exit(1);
+
+ case DB_CORRUPTION:
+ fputs("InnoDB: We detected index corruption"
+ " in an InnoDB type table.\n"
+ "InnoDB: You have to dump + drop + reimport"
+ " the table or, in\n"
+ "InnoDB: a case of widespread corruption,"
+ " dump all InnoDB\n"
+ "InnoDB: tables and recreate the"
+ " whole InnoDB tablespace.\n"
+ "InnoDB: If the mysqld server crashes"
+ " after the startup or when\n"
+ "InnoDB: you dump the tables, look at\n"
+ "InnoDB: " REFMAN "forcing-recovery.html"
+ " for help.\n", stderr);
+ break;
+ default:
+ fprintf(stderr, "InnoDB: unknown error code %lu\n",
+ (ulong) err);
+ ut_error;
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(FALSE);
+}
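+
+/* Typical caller pattern (a sketch distilled from the callers in this
+file, not itself part of the original source):
+
+run_again:
+	...
+	err = trx->error_state;
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+		if (row_mysql_handle_errors(&err, trx, thr, &savept)) {
+			goto run_again;		(it was a lock wait: retry)
+		}
+		return((int) err);		(genuine error: give up)
+	}
+*/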
+
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table) /*!< in: Innobase table handle */
+{
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ ulint ref_len;
+
+ heap = mem_heap_create(sizeof *prebuilt + 128);
+
+ prebuilt = mem_heap_zalloc(heap, sizeof *prebuilt);
+
+ prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+ prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+ prebuilt->table = table;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->heap = heap;
+
+ prebuilt->pcur = btr_pcur_create_for_mysql();
+ prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = 99999999;
+
+ prebuilt->search_tuple = dtuple_create(
+ heap, 2 * dict_table_get_n_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Make sure that search_tuple is long enough for clustered index */
+ ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ prebuilt->autoinc_error = 0;
+ prebuilt->autoinc_offset = 0;
+
+ /* Default to 1, we will set the actual value later in
+ ha_innobase::get_auto_increment(). */
+ prebuilt->autoinc_increment = 1;
+
+ prebuilt->autoinc_last_value = 0;
+
+ return(prebuilt);
+}
+
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked) /*!< in: TRUE=data dictionary locked */
+{
+ ulint i;
+
+ if (UNIV_UNLIKELY
+ (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED
+ || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu,"
+ " magic n2 %lu, table name ",
+ (ulong) prebuilt->magic_n,
+ (ulong) prebuilt->magic_n2);
+ ut_print_name(stderr, NULL, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->magic_n = ROW_PREBUILT_FREED;
+ prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+ btr_pcur_free_for_mysql(prebuilt->pcur);
+ btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+ if (prebuilt->mysql_template) {
+ mem_free(prebuilt->mysql_template);
+ }
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ mem_heap_free(prebuilt->blob_heap);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ if (prebuilt->fetch_cache[i] != NULL) {
+
+ if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+ (prebuilt->fetch_cache[i]) - 4))
+ || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+ (prebuilt->fetch_cache[i])
+ + prebuilt->mysql_row_len))) {
+ fputs("InnoDB: Error: trying to free"
+ " a corrupt fetch buffer.\n", stderr);
+
+ mem_analyze_corruption(
+ prebuilt->fetch_cache[i]);
+
+ ut_error;
+ }
+
+ mem_free((prebuilt->fetch_cache[i]) - 4);
+ }
+ }
+
+ dict_table_decrement_handle_count(prebuilt->table, dict_locked);
+
+ mem_heap_free(prebuilt->heap);
+}
+
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (trx->magic_n != TRX_MAGIC_N) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: trx handle. Magic n %lu\n",
+ (ulong) trx->magic_n);
+
+ mem_analyze_corruption(trx);
+
+ ut_error;
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it.
+@return prebuilt dtuple; the column type information is also set in it */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node == NULL) {
+
+ /* Not called before for this handle: create an insert node
+ and query graph to the prebuilt struct */
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(
+ prebuilt->heap, prebuilt->mysql_row_len);
+ }
+
+ row = dtuple_create(prebuilt->heap,
+ dict_table_get_n_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph = que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->ins_node->row);
+}
+
+/*********************************************************************//**
+Updates the table modification counter and calculates new estimates
+for table and index statistics if necessary. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ulint counter;
+
+ counter = table->stat_modified_counter;
+
+ table->stat_modified_counter = counter + 1;
+
+ /* Calculate new statistics if 1 / 16 of table has been modified
+ since the last time a statistics batch was run, or if
+ stat_modified_counter > 2 000 000 000 (to avoid wrap-around).
+ We calculate statistics at most every 16th round, since we may have
+ a counter table which is very small and updated very often. */
+
+ if (counter > 2000000000
+ || ((ib_int64_t)counter > 16 + table->stat_n_rows / 16)) {
+
+ dict_update_statistics(table);
+ }
+}
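+
+/* Worked example (not part of the original source): for a table with
+stat_n_rows == 1 600 000, the check above triggers once the counter
+exceeds 16 + 1 600 000 / 16 == 100 016 modifications, i.e. roughly
+every 1/16 of the table. */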
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. */
+UNIV_INTERN
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mutex_enter(&kernel_mutex);
+
+ lock_release_autoinc_locks(trx);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+{
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ const dict_table_t* table = prebuilt->table;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ /* If we already hold an AUTOINC lock on the table then do nothing.
+ Note: We peek at the value of the current owner without acquiring
+	the kernel mutex. */
+ if (trx == table->autoinc_trx) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "setting auto-inc lock";
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ /* We use the insert query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /*!< in: table to lock, or NULL
+ if prebuilt->table should be
+ locked as
+ prebuilt->select_lock_type */
+ ulint mode) /*!< in: lock mode of table
+ (ignored if table==NULL) */
+{
+ trx_t* trx = prebuilt->trx;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "setting table lock";
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ if (table) {
+ err = lock_table(0, table, mode, thr);
+ } else {
+ err = lock_table(0, prebuilt->table,
+ prebuilt->select_lock_type, thr);
+ }
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that"
+ " newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "inserting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started(trx);
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ /* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW;
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ prebuilt->table->stat_n_rows++;
+
+ srv_n_rows_inserted++;
+
+ if (prebuilt->table->stat_n_rows == 0) {
+ /* Avoid wrap-over */
+ prebuilt->table->stat_n_rows--;
+ }
+
+ row_update_statistics_if_needed(prebuilt->table);
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph = que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap) /*!< in: mem heap from which allocated */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(heap);
+
+ node->in_mysql_interface = TRUE;
+ node->is_delete = FALSE;
+ node->searched_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table), heap);
+
+ node->update_n_fields = dict_table_get_n_cols(table);
+
+ UT_LIST_INIT(node->columns);
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->upd_node == NULL) {
+
+ /* Not called before for this handle: create an update node
+ and query graph to the prebuilt struct */
+
+ node = row_create_update_node_for_mysql(table, prebuilt->heap);
+
+ prebuilt->upd_node = node;
+
+ prebuilt->upd_graph = que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_update_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ ulint err;
+ que_thr_t* thr;
+ ibool was_lock_wait;
+ dict_index_t* clust_index;
+ /* ulint ref_len; */
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ UT_NOT_USED(mysql_rec);
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw"
+ " is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "updating or deleting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started(trx);
+
+ node = prebuilt->upd_node;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (prebuilt->pcur->btr_cur.index == clust_index) {
+ btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur);
+ } else {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->clust_pcur);
+ }
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ /* MySQL seems to call rnd_pos before updating each row it
+ has cached: we can get the correct cursor position from
+ prebuilt->pcur; NOTE that we cannot build the row reference
+ from mysql_rec if the clustered index was automatically
+ generated for the table: MySQL does not know anything about
+ the row id used as the clustered index key */
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ thr->lock_state= QUE_THR_LOCK_ROW;
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (node->is_delete) {
+ if (prebuilt->table->stat_n_rows > 0) {
+ prebuilt->table->stat_n_rows--;
+ }
+
+ srv_n_rows_deleted++;
+ } else {
+ srv_n_rows_updated++;
+ }
+
+ row_update_statistics_if_needed(prebuilt->table);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or
+this session is using a READ COMMITTED isolation level. Before
+calling this function we must use trx_reset_new_rec_lock_info() and
+trx_register_new_rec_lock() to store information on which new record locks
+were actually set. This function removes a newly set lock under prebuilt->pcur,
+and also under prebuilt->clust_pcur. Currently, this is only used and tested
+in the case of an UPDATE or a DELETE statement, where the row lock is of the
+LOCK_X type.
+Thus, this implements a 'mini-rollback' that releases the latest record
+locks we set.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_unlock_for_mysql(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL
+ handle */
+ ibool has_latches_on_recs)/*!< TRUE if called so that we have
+ the latches on the records under pcur
+ and clust_pcur, and we do not need to
+ reposition the cursors. */
+{
+ btr_pcur_t* pcur = prebuilt->pcur;
+ btr_pcur_t* clust_pcur = prebuilt->clust_pcur;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (UNIV_UNLIKELY
+ (!srv_locks_unsafe_for_binlog
+ && trx->isolation_level != TRX_ISO_READ_COMMITTED)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: calling row_unlock_for_mysql though\n"
+ "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n"
+ "InnoDB: this session is not using"
+ " READ COMMITTED isolation level.\n");
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "unlock_row";
+
+ if (prebuilt->new_rec_locks >= 1) {
+
+ const rec_t* rec;
+ dict_index_t* index;
+ trx_id_t rec_trx_id;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ /* Restore the cursor position and find the record */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ if (prebuilt->new_rec_locks >= 2) {
+ /* Restore the cursor position and find the record
+ in the clustered index. */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ clust_pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(clust_pcur);
+ index = btr_pcur_get_btr_cur(clust_pcur)->index;
+ }
+
+ if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+ /* This is not a clustered index record. We
+ do not know how to unlock the record. */
+ goto no_unlock;
+ }
+
+ /* If the record has been modified by this
+ transaction, do not unlock it. */
+
+ if (index->trx_id_offset) {
+ rec_trx_id = trx_read_trx_id(rec
+ + index->trx_id_offset);
+ } else {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ if (ut_dulint_cmp(rec_trx_id, trx->id) != 0) {
+ /* We did not update the record: unlock it */
+
+ rec = btr_pcur_get_rec(pcur);
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ lock_rec_unlock(trx, btr_pcur_get_block(pcur),
+ rec, prebuilt->select_lock_type);
+
+ if (prebuilt->new_rec_locks >= 2) {
+ rec = btr_pcur_get_rec(clust_pcur);
+ index = btr_pcur_get_btr_cur(clust_pcur)->index;
+
+ lock_rec_unlock(trx,
+ btr_pcur_get_block(clust_pcur),
+ rec,
+ prebuilt->select_lock_type);
+ }
+ }
+no_unlock:
+ mtr_commit(&mtr);
+ }
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
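+
+/* Example (illustrative sketch): a caller such as the handler's
+unlock_row method would release the lock on a non-matching row only
+under the conditions this function itself checks:
+
+	if (srv_locks_unsafe_for_binlog
+	    || prebuilt->trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+
+		row_unlock_for_mysql(prebuilt, FALSE);
+	}
+*/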
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+{
+ ulint err;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ /* Note that the cascade node is a subnode of another InnoDB
+ query graph node. We do a normal lock wait in this node, but
+ all errors are handled by the parent node. */
+
+ if (err == DB_LOCK_WAIT) {
+ /* Handle lock wait here */
+
+ que_thr_stop_for_mysql(thr);
+
+ srv_suspend_mysql_thread(thr);
+
+ /* Note that a lock wait may also end in a lock wait timeout,
+ or this transaction is picked as a victim in selective
+ deadlock resolution */
+
+ if (trx->error_state != DB_SUCCESS) {
+
+ return(trx->error_state);
+ }
+
+ /* Retry operation after a normal lock wait */
+
+ goto run_again;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (node->is_delete) {
+ if (table->stat_n_rows > 0) {
+ table->stat_n_rows--;
+ }
+
+ srv_n_rows_deleted++;
+ } else {
+ srv_n_rows_updated++;
+ }
+
+ row_update_statistics_if_needed(table);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ const dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(table);
+
+ return(dict_index_get_nth_col(clust_index, 0)->mtype == DATA_SYS);
+}
+
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index. We have
+to take into account whether we generated a default clustered index for the table.
+@return the key number used inside MySQL */
+UNIV_INTERN
+ulint
+row_get_mysql_key_number_for_index(
+/*===============================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ const dict_index_t* ind;
+ ulint i;
+
+ ut_a(index);
+
+ i = 0;
+ ind = dict_table_get_first_index(index->table);
+
+ while (index != ind) {
+ ind = dict_table_get_next_index(ind);
+ i++;
+ }
+
+ if (row_table_got_default_clust_index(index->table)) {
+ ut_a(i > 0);
+ i--;
+ }
+
+ return(i);
+}
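+
+/* Worked example: if the clustered index was generated on the row id,
+the index list of the table looks like this, and we subtract one so
+that the numbering matches what MySQL sees:
+
+	position 0: generated clustered index (hidden from MySQL)
+	position 1: first user-defined index   -> MySQL key number 0
+	position 2: second user-defined index  -> MySQL key number 1
+*/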
+
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ rw_lock_s_lock_func(&dict_operation_lock, 0, file, line);
+
+ trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+ rw_lock_s_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
+
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0
+ || trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks or lock waits can occur then in these operations */
+
+ rw_lock_x_lock_func(&dict_operation_lock, 0, file, line);
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+
+ mutex_enter(&(dict_sys->mutex));
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ mutex_exit(&(dict_sys->mutex));
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
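+
+/* Usage note (illustrative): DDL code paths bracket their dictionary
+changes with the exclusive pair, while read-only uses such as foreign
+key checks pair row_mysql_freeze_data_dictionary() with
+row_mysql_unfreeze_data_dictionary():
+
+	row_mysql_lock_data_dictionary(trx);
+	(modify dict_sys and the SYS_ tables)
+	row_mysql_unlock_data_dictionary(trx);
+*/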
+
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate().
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed) */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ const char* table_name;
+ ulint table_name_len;
+ ulint err;
+ ulint i;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+err_exit:
+ dict_mem_table_free(table);
+ trx_commit_for_mysql(trx);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "creating table";
+
+ if (row_mysql_is_system_table(table->name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL system"
+ " table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be"
+ " of the MyISAM type!\n",
+ table->name);
+ goto err_exit;
+ }
+
+ /* Check that no reserved column names are used. */
+ for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+ if (dict_col_name_is_reserved(
+ dict_table_get_col_name(table, i))) {
+
+ goto err_exit;
+ }
+ }
+
+ trx_start_if_not_started(trx);
+
+ /* The table name is prefixed with the database name and a '/'.
+	Certain table names starting with 'innodb_' have a special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ table_name = strchr(table->name, '/');
+ ut_a(table_name);
+ table_name++;
+ table_name_len = strlen(table_name) + 1;
+
+ if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) {
+
+ /* Table equals "innodb_monitor":
+ start monitor prints */
+
+ srv_print_innodb_monitor = TRUE;
+
+ /* The lock timeout monitor thread also takes care
+ of InnoDB monitor prints */
+
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_lock_monitor)) {
+
+ srv_print_innodb_monitor = TRUE;
+ srv_print_innodb_lock_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_mem_validate)) {
+ /* We define here a debugging feature intended for
+ developers */
+
+ fputs("Validating InnoDB memory:\n"
+ "to use this feature you must compile InnoDB with\n"
+ "UNIV_MEM_DEBUG defined in univ.i and"
+ " the server must be\n"
+ "quiet because allocation from a mem heap"
+ " is not protected\n"
+ "by any semaphore.\n", stderr);
+#ifdef UNIV_MEM_DEBUG
+ ut_a(mem_validate());
+ fputs("Memory validated\n", stderr);
+#else /* UNIV_MEM_DEBUG */
+ fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
+ stderr);
+#endif /* UNIV_MEM_DEBUG */
+ }
+
+ heap = mem_heap_create(512);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ node = tab_create_graph_create(table, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ }
+
+ switch (err) {
+ case DB_OUT_OF_FILE_SPACE:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: cannot create table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" because tablespace full\n", stderr);
+
+ if (dict_table_get_low(table->name)) {
+
+ row_drop_table_for_mysql(table->name, trx, FALSE);
+ trx_commit_for_mysql(trx);
+ }
+ break;
+
+ case DB_DUPLICATE_KEY:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" already exists in InnoDB internal\n"
+ "InnoDB: data dictionary. Have you deleted"
+ " the .frm file\n"
+ "InnoDB: and not used DROP TABLE?"
+ " Have you used DROP DATABASE\n"
+ "InnoDB: for InnoDB tables in"
+ " MySQL version <= 3.23.43?\n"
+ "InnoDB: See the Restrictions section"
+ " of the InnoDB manual.\n"
+ "InnoDB: You can drop the orphaned table"
+ " inside InnoDB by\n"
+ "InnoDB: creating an InnoDB table with"
+ " the same name in another\n"
+ "InnoDB: database and copying the .frm file"
+ " to the current database.\n"
+ "InnoDB: Then MySQL thinks the table exists,"
+ " and DROP TABLE will\n"
+ "InnoDB: succeed.\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+
+ /* We may also get err == DB_ERROR if the .ibd file for the
+ table already exists */
+
+ break;
+ }
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return((int) err);
+}
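+
+/* Example (illustrative): issuing the statement
+
+	CREATE TABLE innodb_monitor (a INT) ENGINE=INNODB;
+
+in any database starts the periodic InnoDB Monitor output, and dropping
+the table stops it again; see row_drop_table_for_mysql(). */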
+
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently, failure
+to create an index results in dropping the whole table! This is not a
+problem at present, because all indexes must be created at the same
+time as the table.
+@return error number or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths) /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+ ulint i;
+ ulint len;
+ char* table_name;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "creating index";
+
+ /* Copy the table name because we may want to drop the
+ table later, after the index object is freed (inside
+ que_run_threads()) and thus index->table_name is not available. */
+ table_name = mem_strdup(index->table_name);
+
+ trx_start_if_not_started(trx);
+
+ /* Check that the same column does not appear twice in the index.
+ Starting from 4.0.14, InnoDB should be able to cope with that, but
+	it is safer not to allow duplicates. */
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ ulint j;
+
+ for (j = 0; j < i; j++) {
+ if (0 == ut_strcmp(
+ dict_index_get_nth_field(index, j)->name,
+ dict_index_get_nth_field(index, i)->name)) {
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: column ", stderr);
+ ut_print_name(stderr, trx, FALSE,
+ dict_index_get_nth_field(
+ index, i)->name);
+ fputs(" appears twice in ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: This is not allowed"
+ " in InnoDB.\n", stderr);
+
+ err = DB_COL_APPEARS_TWICE_IN_INDEX;
+
+ goto error_handling;
+ }
+ }
+
+ /* Check also that prefix_len and actual length
+ < DICT_MAX_INDEX_COL_LEN */
+
+ len = dict_index_get_nth_field(index, i)->prefix_len;
+
+ if (field_lengths) {
+ len = ut_max(len, field_lengths[i]);
+ }
+
+ if (len >= DICT_MAX_INDEX_COL_LEN) {
+ err = DB_TOO_BIG_RECORD;
+
+ goto error_handling;
+ }
+ }
+
+ heap = mem_heap_create(512);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ /* Note that the space id where we store the index is inherited from
+ the table in dict_build_index_def_step() in dict0crea.c. */
+
+ node = ind_create_graph_create(index, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+error_handling:
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ mem_free(table_name);
+
+ return((int) err);
+}
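+
+/* Example (illustrative): for a column prefix index such as KEY (v(300))
+the caller passes the actual field lengths, here a hypothetical array:
+
+	ulint	field_lengths[1] = {300};
+
+	err = row_create_index_for_mysql(index, trx, field_lengths);
+
+and the call fails with DB_TOO_BIG_RECORD if any length reaches
+DICT_MAX_INDEX_COL_LEN. */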
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied by indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. Check also that foreign key
+constraints which reference this table are ok.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+{
+ ulint err;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_a(sql_string);
+
+ trx->op_info = "adding foreign keys";
+
+ trx_start_if_not_started(trx);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ err = dict_create_foreign_constraints(trx, sql_string, name,
+ reject_fks);
+ if (err == DB_SUCCESS) {
+ /* Check that also referencing constraints are ok */
+ err = dict_load_foreigns(name, TRUE);
+ }
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(name, trx, FALSE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ return((int) err);
+}
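+
+/* Example (illustrative): a CREATE TABLE statement such as
+
+	CREATE TABLE child (a INT, b INT,
+	FOREIGN KEY (a, b) REFERENCES parent(c, d)) ENGINE=INNODB;
+
+would lead to a call with name = "test/child" (assuming database test)
+and sql_string containing the statement text, after the indexes of the
+table have been created. */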
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. On Unix, MySQL relies
+in ALTER TABLE on the fact that the table handler does not remove the
+table before all handles to it have been removed. Furthermore, MySQL's
+call to drop a table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.c.
+@return error code or DB_SUCCESS */
+static
+int
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+ const char* name) /*!< in: table name */
+{
+ ulint error;
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+
+ /* If the original transaction was dropping a table referenced by
+ foreign keys, we must set the following to be able to drop the
+ table: */
+
+ trx->check_foreigns = FALSE;
+
+ /* fputs("InnoDB: Error: Dropping table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" in background drop list\n", stderr); */
+
+ /* Try to drop the table in InnoDB */
+
+ error = row_drop_table_for_mysql(name, trx, FALSE);
+
+	/* Flush the log to reduce the probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ trx_commit_for_mysql(trx);
+
+ trx_free_for_background(trx);
+
+ return((int) error);
+}
+
+/*********************************************************************//**
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in the background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+{
+ row_mysql_drop_t* drop;
+ dict_table_t* table;
+ ulint n_tables;
+ ulint n_tables_dropped = 0;
+loop:
+ mutex_enter(&kernel_mutex);
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&kernel_mutex);
+
+ if (drop == NULL) {
+ /* All tables dropped */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+ table = dict_table_get_low(drop->table_name);
+ mutex_exit(&(dict_sys->mutex));
+
+ if (table == NULL) {
+ /* If for some reason the table has already been dropped
+ through some other mechanism, do not try to drop it */
+
+ goto already_dropped;
+ }
+
+ if (DB_SUCCESS != row_drop_table_for_mysql_in_background(
+ drop->table_name)) {
+ /* If the DROP fails for some table, we return, and let the
+ main thread retry later */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ n_tables_dropped++;
+
+already_dropped:
+ mutex_enter(&kernel_mutex);
+
+ UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Dropped table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, drop->table_name);
+ fputs(" in background drop queue.\n", stderr);
+
+ mem_free(drop->table_name);
+
+ mem_free(drop);
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ return(UT_LIST_GET_LEN(row_mysql_drop_list));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ const char* name) /*!< in: table name */
+{
+ row_mysql_drop_t* drop;
+
+ mutex_enter(&kernel_mutex);
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+	/* Check if the table is already in the drop list */
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ while (drop != NULL) {
+ if (strcmp(drop->table_name, name) == 0) {
+ /* Already in the list */
+
+ mutex_exit(&kernel_mutex);
+
+ return(FALSE);
+ }
+
+ drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop);
+ }
+
+ drop = mem_alloc(sizeof(row_mysql_drop_t));
+
+ drop->table_name = mem_strdup(name);
+
+ UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ /* fputs("InnoDB: Adding table ", stderr);
+ ut_print_name(stderr, trx, TRUE, drop->table_name);
+ fputs(" to background drop list\n", stderr); */
+
+ mutex_exit(&kernel_mutex);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set to TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_foreign_t* foreign;
+ dulint new_id;
+ dict_table_t* table;
+ ibool success;
+ ulint err;
+ pars_info_t* info = NULL;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages.
+
+ 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+ MySQL table lock on the table before we can do DISCARD
+ TABLESPACE. Then there are no running queries on the table.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: we remove all entries for the tablespace in
+ the insert buffer tree; as long as the tablespace mem object
+ does not exist, ongoing insert buffer page merges are
+ discarded in buf0rea.c. If we recreate the tablespace mem
+ object with IMPORT TABLESPACE later, then the tablespace will
+ have the same id, but the tablespace_version field in the mem
+ object is different, and ongoing old insert buffer page merges
+ get discarded.
+
+ 4) Linear readahead and random readahead: we use the same
+ method as in 3) to discard ongoing operations.
+
+ 5) FOREIGN KEY operations: if
+ table->n_foreign_key_checks_running > 0, we do not allow the
+ discard. We also reserve the data dictionary latch. */
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "discarding tablespace";
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: is in the system tablespace 0"
+ " which cannot be discarded\n", stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: You are trying to DISCARD table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: though there is a foreign key check"
+ " running on it.\n"
+ "InnoDB: Cannot discard the table.\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns) {
+
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow discarding a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot DISCARD table ", ef);
+		ut_print_name(ef, trx, TRUE, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(stderr, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+
+ new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+ /* Remove all locks except the table-level S and X locks. */
+ lock_remove_all_on_table(table, FALSE);
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "table_name", name);
+ pars_info_add_dulint_literal(info, "new_id", new_id);
+
+ err = que_eval_sql(info,
+ "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
+ "old_id CHAR;\n"
+ "BEGIN\n"
+ "SELECT ID INTO old_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "UPDATE SYS_TABLES SET ID = :new_id\n"
+ " WHERE ID = old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ dict_table_change_id_in_cache(table, new_id);
+
+ success = fil_discard_tablespace(table->space);
+
+ if (!success) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+
+ err = DB_ERROR;
+ } else {
+ /* Set the flag which tells that now it is legal to
+ IMPORT a tablespace for this table */
+ table->tablespace_discarded = TRUE;
+ table->ibd_file_missing = TRUE;
+ }
+ }
+
+funct_exit:
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_table_t* table;
+ ibool success;
+ ib_uint64_t current_lsn;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx_start_if_not_started(trx);
+
+ trx->op_info = "importing tablespace";
+
+ current_lsn = log_get_lsn();
+
+ /* It is possible, though very improbable, that the lsn's in the
+ tablespace to be imported have risen above the current system lsn, if
+ a lengthy purge, ibuf merge, or rollback was performed on a backup
+ taken with ibbackup. If that is the case, reset page lsn's in the
+ file. We assume that mysqld was shut down after it performed these
+ cleanup operations on the .ibd file, so that it stamped the latest lsn
+ to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+
+ TODO: reset also the trx id's in clustered index records and write
+ a new space id to each data page. That would allow us to import clean
+ .ibd files from another MySQL installation. */
+
+ success = fil_reset_too_high_lsns(name, current_lsn);
+
+ if (!success) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ row_mysql_lock_data_dictionary(trx);
+
+ goto funct_exit;
+ }
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: does not exist in the InnoDB data dictionary\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: is in the system tablespace 0"
+ " which cannot be imported\n", stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (!table->tablespace_discarded) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: you are trying to"
+ " IMPORT a tablespace\n"
+ "InnoDB: ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(", though you have not called DISCARD on it yet\n"
+ "InnoDB: during the lifetime of the mysqld process!\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Play safe and remove all insert buffer entries, though we should
+ have removed them already when DISCARD TABLESPACE was called */
+
+ ibuf_delete_for_discarded_space(table->space);
+
+ success = fil_open_single_table_tablespace(
+ TRUE, table->space,
+ table->flags == DICT_TF_COMPACT ? 0 : table->flags,
+ table->name);
+ if (success) {
+ table->ibd_file_missing = FALSE;
+ table->tablespace_discarded = FALSE;
+ } else {
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: cannot find or open in the"
+ " database directory the .ibd file of\n"
+ "InnoDB: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+ }
+
+ err = DB_ERROR;
+ }
+
+funct_exit:
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
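+
+/* Example (illustrative): the intended SQL sequence is
+
+	ALTER TABLE t DISCARD TABLESPACE;
+	(replace t.ibd in the database directory with the file to import)
+	ALTER TABLE t IMPORT TABLESPACE;
+
+the IMPORT fails unless the DISCARD was done earlier in the lifetime of
+the same mysqld process, because table->tablespace_discarded is only
+set in memory. */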
+
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+ dict_table_t* table, /*!< in: table handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_foreign_t* foreign;
+ ulint err;
+ mem_heap_t* heap;
+ byte* buf;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ dulint new_id;
+ ulint recreate_space = 0;
+ pars_info_t* info = NULL;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages.
+
+ 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+ MySQL table lock on the table before we can do TRUNCATE
+ TABLE. Then there are no running queries on the table. This is
+ guaranteed, because in ha_innobase::store_lock(), we do not
+ weaken the TL_WRITE lock requested by MySQL when executing
+ SQLCOM_TRUNCATE.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE,
+ so we do not have to remove insert buffer records, as the
+ insert buffer works at a low level. If a freed page is later
+ reallocated, the allocator will remove the ibuf entries for
+ it.
+
+ When we truncate *.ibd files by recreating them (analogous to
+ DISCARD TABLESPACE), we remove all entries for the table in the
+ insert buffer tree. This is not strictly necessary, because
+ in 6) we will assign a new tablespace identifier, but we can
+ free up some space in the system tablespace.
+
+ 4) Linear readahead and random readahead: we use the same
+ method as in 3) to discard ongoing operations. (This is only
+ relevant for TRUNCATE TABLE by DISCARD TABLESPACE.)
+
+ 5) FOREIGN KEY operations: if
+ table->n_foreign_key_checks_running > 0, we do not allow the
+ TRUNCATE. We also reserve the data dictionary latch.
+
+ 6) Crash recovery: To prevent the application of pre-truncation
+ redo log records on the truncated tablespace, we will assign
+ a new tablespace identifier to the truncated tablespace. */
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(table);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "truncating table";
+
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+ /* Prevent foreign key checks etc. while we are truncating the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns) {
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow truncating a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot truncate table ", ef);
+ ut_print_name(ef, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because it is referenced by ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+ they can cope with the table having been truncated here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Cannot truncate table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because there is a foreign key check"
+ " running on it.\n",
+ stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Remove all locks except the table-level S and X locks. */
+ lock_remove_all_on_table(table, FALSE);
+
+ trx->table_id = table->id;
+
+ if (table->space && !table->dir_path_of_temp_table) {
+ /* Discard and create the single-table tablespace. */
+ ulint space = table->space;
+ ulint flags = fil_space_get_flags(space);
+
+ if (flags != ULINT_UNDEFINED
+ && fil_discard_tablespace(space)) {
+
+ dict_index_t* index;
+
+ space = 0;
+
+ if (fil_create_new_single_table_tablespace(
+ &space, table->name, FALSE, flags,
+ FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: TRUNCATE TABLE %s failed to"
+ " create a new tablespace\n",
+ table->name);
+ table->ibd_file_missing = 1;
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ recreate_space = space;
+
+ /* Replace the space_id in the data dictionary cache.
+			The persistent data dictionary (SYS_TABLES.SPACE
+			and SYS_INDEXES.SPACE) is updated later in this
+ function. */
+ table->space = space;
+ index = dict_table_get_first_index(table);
+ do {
+ index->space = space;
+ index = dict_table_get_next_index(index);
+ } while (index);
+
+ mtr_start(&mtr);
+ fsp_header_init(space,
+ FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
+
+ /* scan SYS_INDEXES for all indexes of the table */
+ heap = mem_heap_create(800);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ mtr_start(&mtr);
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF, &pcur, &mtr);
+ for (;;) {
+ rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint root_page_no;
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* The end of SYS_INDEXES has been reached. */
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+
+ if (memcmp(buf, field, len) != 0) {
+ /* End of indexes for the table (TABLE_ID mismatch). */
+ break;
+ }
+
+ if (rec_get_deleted_flag(rec, FALSE)) {
+ /* The index has been dropped. */
+ goto next_rec;
+ }
+
+ /* This call may commit and restart mtr
+ and reposition pcur. */
+ root_page_no = dict_truncate_index_tree(table, recreate_space,
+ &pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (root_page_no != FIL_NULL) {
+ page_rec_write_index_page_no(
+ rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+ root_page_no, &mtr);
+ /* We will need to commit and restart the
+ mini-transaction in order to avoid deadlocks.
+ The dict_truncate_index_tree() call has allocated
+ a page in this mini-transaction, and the rest of
+ this loop could latch another index page. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+ }
+
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ mem_heap_free(heap);
+
+ new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+ info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "space", (lint) table->space);
+ pars_info_add_dulint_literal(info, "old_id", table->id);
+ pars_info_add_dulint_literal(info, "new_id", new_id);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES"
+ " SET ID = :new_id, SPACE = :space\n"
+ " WHERE ID = :old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_INDEXES"
+ " SET TABLE_ID = :new_id, SPACE = :space\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Unable to assign a new identifier to table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: after truncating it. Background processes"
+ " may corrupt the table!\n", stderr);
+ err = DB_ERROR;
+ } else {
+ dict_table_change_id_in_cache(table, new_id);
+ }
+
+ /* MySQL calls ha_innobase::reset_auto_increment() which does
+ the same thing. */
+ dict_table_autoinc_lock(table);
+ dict_table_autoinc_initialize(table, 1);
+ dict_table_autoinc_unlock(table);
+ dict_update_statistics(table);
+
+ trx_commit_for_mysql(trx);
+
+funct_exit:
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL. If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread. If the data dictionary was not already locked
+by the transaction, the transaction will be committed. Otherwise, the
+data dictionary will remain locked.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool drop_db)/*!< in: TRUE=dropping whole database */
+{
+ dict_foreign_t* foreign;
+ dict_table_t* table;
+ ulint space_id;
+ ulint err;
+ const char* table_name;
+ ulint namelen;
+ ibool locked_dictionary = FALSE;
+ pars_info_t* info = NULL;
+
+ ut_a(name != NULL);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "dropping table";
+
+ trx_start_if_not_started(trx);
+
+ /* The table name is prefixed with the database name and a '/'.
+	Certain table names starting with 'innodb_' have a special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ table_name = strchr(name, '/');
+ ut_a(table_name);
+ table_name++;
+ namelen = strlen(table_name) + 1;
+
+ if (namelen == sizeof S_innodb_monitor
+ && !memcmp(table_name, S_innodb_monitor,
+ sizeof S_innodb_monitor)) {
+
+ /* Table name equals "innodb_monitor":
+ stop monitor prints */
+
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_lock_monitor
+ && !memcmp(table_name, S_innodb_lock_monitor,
+ sizeof S_innodb_lock_monitor)) {
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_tablespace_monitor
+ && !memcmp(table_name, S_innodb_tablespace_monitor,
+ sizeof S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_table_monitor
+ && !memcmp(table_name, S_innodb_table_monitor,
+ sizeof S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = FALSE;
+ }
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+ /* Prevent foreign key checks etc. while we are dropping the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ locked_dictionary = TRUE;
+ }
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is"
+ " trying to drop it.\n"
+ "InnoDB: Have you copied the .frm file"
+ " of the table to the\n"
+ "InnoDB: MySQL database directory"
+ " from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+check_next_foreign:
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns
+ && !(drop_db && dict_tables_have_same_db(
+ name, foreign->foreign_table_name))) {
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow dropping a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot drop table ", ef);
+ ut_print_name(ef, trx, TRUE, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+
+ if (foreign && trx->check_foreigns) {
+ goto check_next_foreign;
+ }
+
+ if (table->n_mysql_handles_opened > 0) {
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table->name);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: MySQL is"
+ " trying to drop table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: though there are still"
+ " open handles to it.\n"
+ "InnoDB: Adding the table to the"
+ " background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+ they can cope with the table having been dropped here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ const char* table_name = table->name;
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table_name);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: You are trying to drop table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ fputs("\n"
+ "InnoDB: though there is a"
+ " foreign key check running on it.\n"
+ "InnoDB: Adding the table to"
+ " the background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+ /* Remove all locks there are on the table or its records */
+ lock_remove_all_on_table(table, TRUE);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ trx->table_id = table->id;
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "table_name", name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "sys_foreign_id CHAR;\n"
+ "table_id CHAR;\n"
+ "index_id CHAR;\n"
+ "foreign_id CHAR;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "SELECT ID INTO table_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "found := 1;\n"
+ "SELECT ID INTO sys_foreign_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = 'SYS_FOREIGN'\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (:table_name = 'SYS_FOREIGN') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO index_id\n"
+ " FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID = table_id\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS\n"
+ " WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_INDEXES\n"
+ " WHERE ID = index_id\n"
+ " AND TABLE_ID = table_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "DELETE FROM SYS_COLUMNS\n"
+ "WHERE TABLE_ID = table_id;\n"
+ "DELETE FROM SYS_TABLES\n"
+ "WHERE ID = table_id;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+ ut_error;
+ } else {
+ ibool is_path;
+ const char* name_or_path;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(200);
+
+ /* Clone the name, in case it has been allocated
+ from table->heap, which will be freed by
+ dict_table_remove_from_cache(table) below. */
+ name = mem_heap_strdup(heap, name);
+ space_id = table->space;
+
+ if (table->dir_path_of_temp_table != NULL) {
+ is_path = TRUE;
+ name_or_path = mem_heap_strdup(
+ heap, table->dir_path_of_temp_table);
+ } else {
+ is_path = FALSE;
+ name_or_path = name;
+ }
+
+ dict_table_remove_from_cache(table);
+
+ if (dict_load_table(name) != NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: not able to remove table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" from the dictionary cache!\n", stderr);
+ err = DB_ERROR;
+ }
+
+ /* Do not drop possible .ibd tablespace if something went
+		wrong: we do not want to delete the user's valuable data */
+
+ if (err == DB_SUCCESS && space_id > 0) {
+ if (!fil_space_for_table_exists_in_mem(space_id,
+ name_or_path,
+ is_path,
+ FALSE, TRUE)) {
+ err = DB_SUCCESS;
+
+ fprintf(stderr,
+ "InnoDB: We removed now the InnoDB"
+ " internal data dictionary entry\n"
+ "InnoDB: of table ");
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, ".\n");
+ } else if (!fil_delete_tablespace(space_id)) {
+ fprintf(stderr,
+ "InnoDB: We removed now the InnoDB"
+ " internal data dictionary entry\n"
+ "InnoDB: of table ");
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, ".\n");
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: not able to"
+ " delete tablespace %lu of table ",
+ (ulong) space_id);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("!\n", stderr);
+ err = DB_ERROR;
+ }
+ }
+
+ mem_heap_free(heap);
+ }
+funct_exit:
+
+ if (locked_dictionary) {
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ return((int) err);
+}
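+
+/* A note on DROP_TABLE_PROC above: the procedure emulates cascading
+dictionary deletes with "poor man's" cursors. Each WHILE found = 1 loop
+re-runs a SELECT ... LOCK IN SHARE MODE and deletes the rows belonging to
+one foreign key or one index per iteration, until SQL % NOTFOUND clears
+the flag. The extra TO_BINARY() comparison apparently makes the FOR_NAME
+match byte-exact even where the plain comparison would be
+case-insensitive. */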
+
+/*******************************************************************//**
+Drop all foreign keys in a database, see Bug#18942.
+Called at the end of row_drop_database_for_mysql().
+@return error code or DB_SUCCESS */
+static
+ulint
+drop_all_foreign_keys_in_db(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* pinfo;
+ ulint err;
+
+ ut_a(name[strlen(name) - 1] == '/');
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "dbname", name);
+
+/** true if for_name is not prefixed with dbname */
+#define TABLE_NOT_IN_THIS_DB \
+"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
+
+ err = que_eval_sql(pinfo,
+ "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
+ "foreign_id CHAR;\n"
+ "for_name CHAR;\n"
+ "found INT;\n"
+ "DECLARE CURSOR cur IS\n"
+ "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME >= :dbname\n"
+ "LOCK IN SHARE MODE\n"
+ "ORDER BY FOR_NAME;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH cur INTO foreign_id, for_name;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
+ " found := 0;\n"
+ " ELSIF (1=1) THEN\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE cur;\n"
+ "COMMIT WORK;\n"
+ "END;\n",
+ FALSE, /* do not reserve dict mutex,
+ we are already holding it */
+ trx);
+
+ return(err);
+}
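+
+/* Worked example of the predicate above, on hypothetical data: with
+:dbname = 'test/' (LENGTH = 5), a SYS_FOREIGN row with FOR_NAME
+'test/child' yields SUBSTR('test/child', 0, 5) = 'test/', so its rows in
+SYS_FOREIGN and SYS_FOREIGN_COLS are deleted; a row 'test2/child' sorts
+after 'test/' in the cursor but fails the prefix comparison, so
+TABLE_NOT_IN_THIS_DB terminates the loop there. */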
+
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_table_t* table;
+ char* table_name;
+ int err = DB_SUCCESS;
+ ulint namelen = strlen(name);
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(name != NULL);
+ ut_a(name[namelen - 1] == '/');
+
+ trx->op_info = "dropping database";
+
+ trx_start_if_not_started(trx);
+loop:
+ row_mysql_lock_data_dictionary(trx);
+
+ while ((table_name = dict_get_first_table_name_in_db(name))) {
+ ut_a(memcmp(table_name, name, namelen) == 0);
+
+ table = dict_table_get_low(table_name);
+
+ ut_a(table);
+
+ /* Wait until MySQL does not have any queries running on
+ the table */
+
+ if (table->n_mysql_handles_opened > 0) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: MySQL is trying to"
+ " drop database ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: though there are still"
+ " open handles to table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ fputs(".\n", stderr);
+
+ os_thread_sleep(1000000);
+
+ mem_free(table_name);
+
+ goto loop;
+ }
+
+ err = row_drop_table_for_mysql(table_name, trx, TRUE);
+ trx_commit_for_mysql(trx);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, " failed with error %lu for table ",
+				(ulong) err);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ putc('\n', stderr);
+ mem_free(table_name);
+ break;
+ }
+
+ mem_free(table_name);
+ }
+
+ if (err == DB_SUCCESS) {
+ /* after dropping all tables try to drop all leftover
+ foreign keys in case orphaned ones exist */
+ err = (int) drop_all_foreign_keys_in_db(name, trx);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, " failed with error %d while "
+				"dropping all foreign keys.\n", err);
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return TRUE if temporary table */
+static
+ibool
+row_is_mysql_tmp_table_name(
+/*========================*/
+ const char* name) /*!< in: table name in the form
+ 'database/tablename' */
+{
+ return(strstr(name, "/#sql") != NULL);
+ /* return(strstr(name, "/@0023sql") != NULL); */
+}
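+
+/* For illustration (hypothetical names): an ALTER TABLE intermediate
+such as 'test/#sql-2d40_1' matches, because '/#sql' occurs in the name,
+while an ordinary user table such as 'test/sql_log' does not. Note that
+strstr() finds '/#sql' anywhere in the name, not only immediately after
+the database prefix. */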
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static
+int
+row_delete_constraint_low(
+/*======================*/
+ const char* id, /*!< in: constraint id */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", id);
+
+ return((int) que_eval_sql(info,
+ "PROCEDURE DELETE_CONSTRAINT () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
+ "END;\n"
+ , FALSE, trx));
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static
+int
+row_delete_constraint(
+/*==================*/
+ const char* id, /*!< in: constraint id */
+ const char* database_name, /*!< in: database name, with the
+ trailing '/' */
+ mem_heap_t* heap, /*!< in: memory heap */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ulint err;
+
+ /* New format constraints have ids <databasename>/<constraintname>. */
+ err = row_delete_constraint_low(
+ mem_heap_strcat(heap, database_name, id), trx);
+
+ if ((err == DB_SUCCESS) && !strchr(id, '/')) {
+ /* Old format < 4.0.18 constraints have constraint ids
+ <number>_<number>. We only try deleting them if the
+ constraint name does not contain a '/' character, otherwise
+ deleting a new format constraint named 'foo/bar' from
+ database 'baz' would remove constraint 'bar' from database
+ 'foo', if it existed. */
+
+ err = row_delete_constraint_low(id, trx);
+ }
+
+ return((int) err);
+}
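+
+/* Illustration with hypothetical ids: for database_name 'test/' and id
+'fk_child', the first call deletes the new-format constraint
+'test/fk_child'; had id been a pre-4.0.18 id such as '0_13', which
+contains no '/', a second call would delete the bare id as well. */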
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool commit) /*!< in: if TRUE then commit trx */
+{
+ dict_table_t* table;
+ ulint err = DB_ERROR;
+ mem_heap_t* heap = NULL;
+ const char** constraints_to_drop = NULL;
+ ulint n_constraints_to_drop = 0;
+ ibool old_is_tmp, new_is_tmp;
+ pars_info_t* info = NULL;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw"
+ " is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ goto funct_exit;
+ } else if (row_mysql_is_system_table(new_name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL"
+ " system table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be"
+ " of the MyISAM type!\n",
+ new_name);
+
+ goto funct_exit;
+ }
+
+ trx->op_info = "renaming table";
+ trx_start_if_not_started(trx);
+
+ old_is_tmp = row_is_mysql_tmp_table_name(old_name);
+ new_is_tmp = row_is_mysql_tmp_table_name(new_name);
+
+ table = dict_table_get_low(old_name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is"
+ " trying to rename the table.\n"
+ "InnoDB: Have you copied the .frm file"
+ " of the table to the\n"
+ "InnoDB: MySQL database directory"
+ " from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ } else if (table->ibd_file_missing) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" does not have an .ibd file"
+ " in the database directory.\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ } else if (new_is_tmp) {
+ /* MySQL is doing an ALTER TABLE command and it renames the
+ original table to a temporary table name. We want to preserve
+ the original foreign key constraint definitions despite the
+ name change. An exception is those constraints for which
+ the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+ heap = mem_heap_create(100);
+
+ err = dict_foreign_parse_drop_constraints(
+ heap, trx, table, &n_constraints_to_drop,
+ &constraints_to_drop);
+
+ if (err != DB_SUCCESS) {
+
+ goto funct_exit;
+ }
+ }
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data from system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME = :new_table_name\n"
+ " WHERE NAME = :old_table_name;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+
+ goto end;
+ } else if (!new_is_tmp) {
+ /* Rename all constraints. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n"
+ "gen_constr_prefix CHAR;\n"
+ "new_db_name CHAR;\n"
+ "foreign_id CHAR;\n"
+ "new_foreign_id CHAR;\n"
+ "old_db_name_len INT;\n"
+ "old_t_name_len INT;\n"
+ "new_db_name_len INT;\n"
+ "id_len INT;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "old_db_name_len := INSTR(:old_table_name, '/')-1;\n"
+ "new_db_name_len := INSTR(:new_table_name, '/')-1;\n"
+ "new_db_name := SUBSTR(:new_table_name, 0,\n"
+ " new_db_name_len);\n"
+ "old_t_name_len := LENGTH(:old_table_name);\n"
+ "gen_constr_prefix := CONCAT(:old_table_name,\n"
+ " '_ibfk_');\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :old_table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:old_table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET FOR_NAME = :new_table_name\n"
+ " WHERE ID = foreign_id;\n"
+ " id_len := LENGTH(foreign_id);\n"
+ " IF (INSTR(foreign_id, '/') > 0) THEN\n"
+ " IF (INSTR(foreign_id,\n"
+ " gen_constr_prefix) > 0)\n"
+ " THEN\n"
+ " new_foreign_id :=\n"
+ " CONCAT(:new_table_name,\n"
+ " SUBSTR(foreign_id, old_t_name_len,\n"
+ " id_len - old_t_name_len));\n"
+ " ELSE\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_db_name,\n"
+ " SUBSTR(foreign_id,\n"
+ " old_db_name_len,\n"
+ " id_len - old_db_name_len));\n"
+ " END IF;\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " UPDATE SYS_FOREIGN_COLS\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n"
+ "WHERE REF_NAME = :old_table_name\n"
+ " AND TO_BINARY(REF_NAME)\n"
+ " = TO_BINARY(:old_table_name);\n"
+ "END;\n"
+ , FALSE, trx);
+
+ } else if (n_constraints_to_drop > 0) {
+ /* Drop some constraints of tmp tables. */
+
+ ulint db_name_len = dict_get_db_name_len(old_name) + 1;
+ char* db_name = mem_heap_strdupl(heap, old_name,
+ db_name_len);
+ ulint i;
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ err = row_delete_constraint(constraints_to_drop[i],
+ db_name, heap, trx);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+
+end:
+ if (err != DB_SUCCESS) {
+ if (err == DB_DUPLICATE_KEY) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error; possible reasons:\n"
+ "InnoDB: 1) Table rename would cause"
+ " two FOREIGN KEY constraints\n"
+ "InnoDB: to have the same internal name"
+ " in case-insensitive comparison.\n"
+ "InnoDB: 2) table ", stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs(" exists in the InnoDB internal data\n"
+ "InnoDB: dictionary though MySQL is"
+ " trying to rename table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" to it.\n"
+ "InnoDB: Have you deleted the .frm file"
+ " and not used DROP TABLE?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: If table ", stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs(" is a temporary table #sql..., then"
+ " it can be that\n"
+ "InnoDB: there are still queries running"
+ " on the table, and it will be\n"
+ "InnoDB: dropped automatically when"
+ " the queries end.\n"
+ "InnoDB: You can drop the orphaned table"
+ " inside InnoDB by\n"
+ "InnoDB: creating an InnoDB table with"
+ " the same name in another\n"
+ "InnoDB: database and copying the .frm file"
+ " to the current database.\n"
+ "InnoDB: Then MySQL thinks the table exists,"
+ " and DROP TABLE will\n"
+ "InnoDB: succeed.\n", stderr);
+ }
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ /* The following call will also rename the .ibd data file if
+ the table is stored in a single-table tablespace */
+
+ if (!dict_table_rename_in_cache(table, new_name,
+ !new_is_tmp)) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ goto funct_exit;
+ }
+
+ /* We only want to switch off some of the type checking in
+ an ALTER, not in a RENAME. */
+
+ err = dict_load_foreigns(
+ new_name, !old_is_tmp || trx->check_foreigns);
+
+ if (err != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+
+ if (old_is_tmp) {
+ fputs(" InnoDB: Error: in ALTER TABLE ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: has or is referenced"
+ " in foreign key constraints\n"
+ "InnoDB: which are not compatible"
+ " with the new table definition.\n",
+ stderr);
+ } else {
+ fputs(" InnoDB: Error: in RENAME TABLE"
+ " table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: is referenced in"
+ " foreign key constraints\n"
+ "InnoDB: which are not compatible"
+ " with the new table definition.\n",
+ stderr);
+ }
+
+ ut_a(dict_table_rename_in_cache(table,
+ old_name, FALSE));
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+ }
+
+funct_exit:
+
+ if (commit) {
+ trx_commit_for_mysql(trx);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks that the index contains entries in ascending order, that the
+unique constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return TRUE if ok */
+static
+ibool
+row_scan_and_check_index(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL */
+ dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries seen in the
+ current consistent read */
+{
+ dtuple_t* prev_entry = NULL;
+ ulint matched_fields;
+ ulint matched_bytes;
+ byte* buf;
+ ulint ret;
+ rec_t* rec;
+ ibool is_ok = TRUE;
+ int cmp;
+ ibool contains_null;
+ ulint i;
+ ulint cnt;
+ mem_heap_t* heap = NULL;
+ ulint n_ext;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ *n_rows = 0;
+
+ if (!row_merge_is_index_usable(prebuilt->trx, index)) {
+ /* A newly created index may lack some delete-marked
+ records that may exist in the read view of
+ prebuilt->trx. Thus, such indexes must not be
+ accessed by consistent read. */
+ return(is_ok);
+ }
+
+ buf = mem_alloc(UNIV_PAGE_SIZE);
+ heap = mem_heap_create(100);
+
+ /* Make a dummy template in prebuilt, which we will use
+ in scanning the index entries */
+
+ prebuilt->index = index;
+ /* row_merge_is_index_usable() was already checked above. */
+ prebuilt->index_usable = TRUE;
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ prebuilt->n_template = 0;
+ prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ cnt = 1000;
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+loop:
+ /* Check thd->killed every 1,000 scanned rows */
+ if (--cnt == 0) {
+ if (trx_is_interrupted(prebuilt->trx)) {
+ goto func_exit;
+ }
+ cnt = 1000;
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: CHECK TABLE on ", stderr);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+		fprintf(stderr, " returned %lu\n", (ulong) ret);
+ /* fall through (this error is ignored by CHECK TABLE) */
+ case DB_END_OF_INDEX:
+func_exit:
+ mem_free(buf);
+ mem_heap_free(heap);
+
+ return(is_ok);
+ }
+
+ *n_rows = *n_rows + 1;
+
+	/* row_search... returns the index record in buf, with the record
+	origin offset within buf stored in the first 4 bytes, because we
+	have built a dummy template */
+
+ rec = buf + mach_read_from_4(buf);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (prev_entry != NULL) {
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+ &matched_fields,
+ &matched_bytes);
+ contains_null = FALSE;
+
+ /* In a unique secondary index we allow equal key values if
+ they contain SQL NULLs */
+
+ for (i = 0;
+ i < dict_index_get_n_ordering_defined_by_user(index);
+ i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(prev_entry, i))) {
+
+ contains_null = TRUE;
+ }
+ }
+
+ if (cmp > 0) {
+ fputs("InnoDB: index records in a wrong order in ",
+ stderr);
+not_ok:
+ dict_index_name_print(stderr,
+ prebuilt->trx, index);
+ fputs("\n"
+ "InnoDB: prev record ", stderr);
+ dtuple_print(stderr, prev_entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ is_ok = FALSE;
+ } else if (dict_index_is_unique(index)
+ && !contains_null
+ && matched_fields
+ >= dict_index_get_n_ordering_defined_by_user(
+ index)) {
+
+ fputs("InnoDB: duplicate key in ", stderr);
+ goto not_ok;
+ }
+ }
+
+ {
+ mem_heap_t* tmp_heap = NULL;
+
+ /* Empty the heap on each round. But preserve offsets[]
+ for the row_rec_to_index_entry() call, by copying them
+ into a separate memory heap when needed. */
+ if (UNIV_UNLIKELY(offsets != offsets_)) {
+ ulint size = rec_offs_get_n_alloc(offsets)
+ * sizeof *offsets;
+
+ tmp_heap = mem_heap_create(size);
+ offsets = mem_heap_dup(tmp_heap, offsets, size);
+ }
+
+ mem_heap_empty(heap);
+
+ prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec,
+ index, offsets,
+ &n_ext, heap);
+
+ if (UNIV_LIKELY_NULL(tmp_heap)) {
+ mem_heap_free(tmp_heap);
+ }
+ }
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
+
+ goto loop;
+}
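+
+/* Sketch of the checks above, on hypothetical consecutive entries of a
+unique index defined on a single column a: equal non-NULL values, e.g.
+(1, ...) followed by (1, ...), are reported as a duplicate key, because
+matched_fields reaches the number of user-defined ordering fields;
+(2, ...) followed by (1, ...) is reported as a wrong order, because
+cmp_dtuple_rec_with_match() returns a positive value when the previous
+entry compares greater than the current record. */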
+
+/*********************************************************************//**
+Checks a table for corruption.
+@return DB_ERROR or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_check_table_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ ulint ret = DB_SUCCESS;
+ ulint old_isolation_level;
+
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ table->name);
+ return(DB_ERROR);
+ }
+
+ prebuilt->trx->op_info = "checking table";
+
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+	of records in some index; to play it safe, we always use
+	REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ /* fputs("Validating index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ putc('\n', stderr); */
+
+ if (!btr_validate_index(index, prebuilt->trx)) {
+ ret = DB_ERROR;
+ } else {
+			if (!row_scan_and_check_index(
+					prebuilt, index, &n_rows)) {
+ ret = DB_ERROR;
+ }
+
+ if (trx_is_interrupted(prebuilt->trx)) {
+ break;
+ }
+
+ /* fprintf(stderr, "%lu entries in index %s\n", n_rows,
+ index->name); */
+
+ if (index == dict_table_get_first_index(table)) {
+ n_rows_in_table = n_rows;
+ } else if (n_rows != n_rows_in_table) {
+
+ ret = DB_ERROR;
+
+ fputs("Error: ", stderr);
+ dict_index_name_print(stderr,
+ prebuilt->trx, index);
+ fprintf(stderr,
+ " contains %lu entries,"
+ " should be %lu\n",
+ (ulong) n_rows,
+ (ulong) n_rows_in_table);
+ }
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
+	/* We also validate the whole adaptive hash index for all tables
+ at every CHECK TABLE */
+
+ if (!btr_search_validate()) {
+
+ ret = DB_ERROR;
+ }
+
+ /* Restore the fatal lock wait timeout after CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ prebuilt->trx->op_info = "";
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return TRUE if monitor table */
+UNIV_INTERN
+ibool
+row_is_magic_monitor_table(
+/*=======================*/
+ const char* table_name) /*!< in: name of the table, in the
+ form database/table_name */
+{
+ const char* name; /* table_name without database/ */
+ ulint len;
+
+ name = strchr(table_name, '/');
+ ut_a(name != NULL);
+ name++;
+ len = strlen(name) + 1;
+
+ if (STR_EQ(name, len, S_innodb_monitor)
+ || STR_EQ(name, len, S_innodb_lock_monitor)
+ || STR_EQ(name, len, S_innodb_tablespace_monitor)
+ || STR_EQ(name, len, S_innodb_table_monitor)
+ || STR_EQ(name, len, S_innodb_mem_validate)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
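+
+/* Example (hypothetical names): 'test/innodb_monitor' returns TRUE in
+any database, since the 'database/' prefix is skipped, whereas
+'test/innodb_monitor_2' returns FALSE: len includes the terminating
+NUL, so only an exact match of the remaining name can succeed. */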
diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.c
new file mode 100644
index 00000000000..500ebe571ab
--- /dev/null
+++ b/storage/innobase/row/row0purge.c
@@ -0,0 +1,689 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0purge.c
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "log0log.h"
+
+/********************************************************************//**
+Creates a purge node for a query graph.
+@return own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ purge_node_t* node;
+
+ ut_ad(parent && heap);
+
+ node = mem_heap_alloc(heap, sizeof(purge_node_t));
+
+ node->common.type = QUE_NODE_PURGE;
+ node->common.parent = parent;
+
+ node->heap = mem_heap_create(256);
+
+ return(node);
+}
+
+/***********************************************************//**
+Repositions the pcur in the purge node on the clustered index record,
+if found.
+@return TRUE if the record was found */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ ulint mode, /*!< in: latching mode */
+ purge_node_t* node, /*!< in: row purge node */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool found;
+
+ if (node->found_clust) {
+ found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+
+ return(found);
+ }
+
+ found = row_search_on_row_ref(&(node->pcur), mode, node->table,
+ node->ref, mtr);
+ node->found_clust = found;
+
+ if (found) {
+ btr_pcur_store_position(&(node->pcur), mtr);
+ }
+
+ return(found);
+}
+
+/***********************************************************//**
+Removes a delete marked clustered index record if possible.
+@return TRUE if success, or if not found, or if modified after the
+delete marking */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ mtr_start(&mtr);
+
+ success = row_purge_reposition_pcur(mode, node, &mtr);
+
+ if (!success) {
+ /* The record is already removed */
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (0 != ut_dulint_cmp(node->roll_ptr, row_get_rec_roll_ptr(
+ rec, index, rec_get_offsets(
+ rec, index, offsets_,
+ ULINT_UNDEFINED, &heap)))) {
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ /* Someone else has modified the record later: do not remove */
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NONE, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_error;
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(success);
+}
+
+/***********************************************************//**
+Removes a clustered index record if it has not been modified after the delete
+marking. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node) /*!< in: row purge node */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing clustered record\n", stderr); */
+
+ success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
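+
+/* The above is the usual optimistic/pessimistic pattern: a cheap
+BTR_MODIFY_LEAF attempt that latches only the leaf page, and only if
+that fails a BTR_MODIFY_TREE retry that may restructure the tree,
+repeated up to BTR_CUR_RETRY_DELETE_N_TIMES with a sleep in between.
+row_purge_remove_sec_if_poss() below follows the same pattern for
+secondary index entries. */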
+
+/***********************************************************//**
+Removes a secondary index entry if possible.
+@return TRUE if success or if not found */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has = 0; /* remove warning */
+ ibool found;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ if (!found) {
+ /* Not found. This is a legitimate condition. In a
+ rollback, InnoDB will remove secondary recs that would
+ be purged anyway. Then the actual purge will not find
+ the secondary index record. Also, the purge itself is
+ eager: if it comes to consider a secondary index
+ record, and notices it does not need to exist in the
+ index, it will remove it. Then if/when the purge
+ comes to consider the secondary index record a second
+ time, it will not exist any more in the index. */
+
+ /* fputs("PURGE:........sec entry not found\n", stderr); */
+ /* dtuple_print(stderr, entry); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If any such
+	version requires it, we should do nothing. */
+
+ mtr_start(&mtr_vers);
+
+ success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers);
+
+ if (success) {
+ old_has = row_vers_old_has_index_entry(
+ TRUE, btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+ if (!success || !old_has) {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NONE, &mtr);
+ success = err == DB_SUCCESS;
+ ut_a(success || err == DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(success);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing secondary record\n", stderr); */
+
+ success = row_purge_remove_sec_if_poss_low(node, index, entry,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_low(node, index, entry,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***********************************************************//**
+Purges a delete marking of a record. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node) /*!< in: row purge node */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ /* Build the index entry */
+ entry = row_build_index_entry(node->row, NULL, index, heap);
+ ut_a(entry);
+ row_purge_remove_sec_if_poss(node, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ row_purge_remove_clust_if_poss(node);
+}
+
+/***********************************************************//**
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern(
+/*==========================*/
+ purge_node_t* node) /*!< in: row purge node */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ibool is_insert;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(node);
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ goto skip_secondaries;
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field_binary(NULL, node->index,
+ node->update)) {
+ /* Build the older version of the index entry */
+ entry = row_build_index_entry(node->row, NULL,
+ index, heap);
+ ut_a(entry);
+ row_purge_remove_sec_if_poss(node, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+skip_secondaries:
+ /* Free possible externally stored fields */
+ for (i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ const upd_field_t* ufield
+ = upd_get_nth_field(node->update, i);
+
+ if (dfield_is_ext(&ufield->new_val)) {
+ buf_block_t* block;
+ ulint internal_offset;
+ byte* data_field;
+
+ /* We use the fact that new_val points to
+ node->undo_rec and get thus the offset of
+ dfield data inside the undo record. Then we
+ can calculate from node->roll_ptr the file
+ address of the new_val data */
+
+ internal_offset
+ = ((const byte*)
+ dfield_get_data(&ufield->new_val))
+ - node->undo_rec;
+
+ ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+ mtr_start(&mtr);
+
+ /* We have to acquire an X-latch to the clustered
+ index tree */
+
+ index = dict_table_get_first_index(node->table);
+
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ /* NOTE: we must also acquire an X-latch to the
+ root page of the tree. We will need it when we
+ free pages from the tree. If the tree is of height 1,
+ the tree X-latch does NOT protect the root page,
+ because it is also a leaf page. Since we will have a
+ latch on an undo log page, we would break the
+ latching order if we would only later latch the
+ root page of such a tree! */
+
+ btr_root_get(index, &mtr);
+
+ /* We assume in purge of externally stored fields
+ that the space id of the undo log record is 0! */
+
+ block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ data_field = buf_block_get_frame(block)
+ + offset + internal_offset;
+
+ ut_a(dfield_get_len(&ufield->new_val)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ btr_free_externally_stored_field(
+ index,
+ data_field + dfield_get_len(&ufield->new_val)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ NULL, NULL, NULL, 0, RB_NONE, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
+}
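+
+/* A note on the loop above: the undo log is assumed to reside in space
+0 (the system tablespace), so buf_page_get(0, 0, page_no, ...) fetches
+the undo page that physically contains new_val; the BLOB pointer in the
+last BTR_EXTERN_FIELD_REF_SIZE bytes of the field is then what
+btr_free_externally_stored_field() frees. */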
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record.
+@return TRUE if a purge operation is required: NOTE that the CALLER
+must then unfreeze the data dictionary! */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+ purge_node_t* node, /*!< in: row undo node */
+ ibool* updated_extern,
+ /*!< out: TRUE if an externally stored field
+ was updated */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ trx_t* trx;
+ undo_no_t undo_no;
+ dulint table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ trx = thr_get_trx(thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ updated_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+
+ return(FALSE);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ node->table = NULL;
+
+ if (type == TRX_UNDO_UPD_EXIST_REC
+ && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) {
+
+ /* Purge requires no changes to indexes: we may return */
+
+ return(FALSE);
+ }
+
+ /* Prevent DROP TABLE etc. from running when we are doing the purge
+ for this row */
+
+ row_mysql_freeze_data_dictionary(trx);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ node->table = dict_table_get_on_id_low(table_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ if (node->table == NULL) {
+ /* The table has been dropped: no need to do purge */
+err_exit:
+ row_mysql_unfreeze_data_dictionary(trx);
+ return(FALSE);
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip purge of missing .ibd files */
+
+ node->table = NULL;
+
+ goto err_exit;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index == NULL) {
+ /* The table was corrupt in the data dictionary */
+
+ goto err_exit;
+ }
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+
+	/* Read into the partial row the fields that occur in indexes */
+
+ if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ ptr = trx_undo_rec_get_partial_row(
+ ptr, clust_index, &node->row,
+ type == TRX_UNDO_UPD_DEL_REC,
+ node->heap);
+ }
+
+ return(TRUE);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the purge for the recorded operation.
+If none are left, or the current purge has completed, returns control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_purge(
+/*======*/
+ purge_node_t* node, /*!< in: row purge node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_ptr_t roll_ptr;
+ ibool purge_needed;
+ ibool updated_extern;
+ trx_t* trx;
+
+ ut_ad(node && thr);
+
+ trx = thr_get_trx(thr);
+
+ node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+ &(node->reservation),
+ node->heap);
+ if (!node->undo_rec) {
+ /* Purge completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+
+ if (node->undo_rec == &trx_purge_dummy_rec) {
+ purge_needed = FALSE;
+ } else {
+ purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
+ thr);
+ /* If purge_needed == TRUE, we must also remember to unfreeze
+ data dictionary! */
+ }
+
+ if (purge_needed) {
+ node->found_clust = FALSE;
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+ row_purge_del_mark(node);
+
+ } else if (updated_extern
+ || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ row_purge_upd_exist_or_extern(node);
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&(node->pcur));
+ }
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ trx_purge_rec_release(node->reservation);
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ purge_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ err = row_purge(node, thr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c
new file mode 100644
index 00000000000..128ac3ba3e8
--- /dev/null
+++ b/storage/innobase/row/row0row.c
@@ -0,0 +1,1168 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0row.c
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "data0type.h"
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "ha_prototypes.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "ut0mem.h"
+
+/*********************************************************************//**
+Gets the offset of the trx id field, in bytes, relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INTERN
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const rec_t* rec __attribute__((unused)),
+ /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint pos;
+ ulint offset;
+ ulint len;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offset = rec_get_nth_field_offs(offsets, pos, &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ return(offset);
+}
+
+/*****************************************************************//**
+When a row is inserted into or purged from a table, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ row_ext_t* ext, /*!< in: externally stored column prefixes,
+ or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory for
+ the index entry is allocated */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ ulint i;
+
+ ut_ad(row && index && heap);
+ ut_ad(dtuple_check_typed(row));
+
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ /* There may only be externally stored columns
+ in a clustered index B-tree of a user table. */
+ ut_a(!ext);
+ } else {
+ dtuple_set_n_fields_cmp(
+ entry, dict_index_get_n_unique_in_tree(index));
+ }
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = ind_field->col;
+ ulint col_no
+ = dict_col_get_no(col);
+ dfield_t* dfield
+ = dtuple_get_nth_field(entry, i);
+ const dfield_t* dfield2
+ = dtuple_get_nth_field(row, col_no);
+ ulint len
+ = dfield_get_len(dfield2);
+
+ dfield_copy(dfield, dfield2);
+
+ if (dfield_is_null(dfield) || ind_field->prefix_len == 0) {
+ continue;
+ }
+
+ /* If a column prefix index, take only the prefix.
+ Prefix-indexed columns may be externally stored. */
+ ut_ad(col->ord_part);
+
+ if (UNIV_LIKELY_NULL(ext)) {
+ /* See if the column is stored externally. */
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ return(NULL);
+ }
+ dfield_set_data(dfield, buf, len);
+ }
+ } else if (dfield_is_ext(dfield)) {
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(ind_field->prefix_len <= len
+ || dict_index_is_clust(index));
+ }
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len, len, dfield_get_data(dfield));
+ dfield_set_len(dfield, len);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
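+
+/* Illustration, assuming a hypothetical secondary index on a column
+prefix, say child(name(10)): the loop above copies the full 'name' value
+from the row, substitutes the cached prefix from ext if the column is
+stored externally, and finally truncates the value with
+dtype_get_at_most_n_mbchars() so that the stored prefix never ends in
+the middle of a multi-byte character. */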
+
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+					ROW_COPY_DATA; the latter also
+					copies the data fields to
+					heap, while the former only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* row;
+ const dict_table_t* table;
+ ulint n_fields;
+ ulint n_ext_cols;
+ ulint* ext_cols = NULL; /* remove warning */
+ ulint len;
+ ulint row_len;
+ byte* buf;
+ ulint i;
+ ulint j;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(index && rec && heap);
+ ut_ad(dict_index_is_clust(index));
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &tmp_heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ if (type != ROW_COPY_POINTERS) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, (ulint*) offsets);
+ }
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ row = dtuple_create(heap, row_len);
+
+ dict_table_copy_types(row, table);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(
+ rec, dict_table_is_comp(table)));
+
+ n_fields = rec_offs_n_fields(offsets);
+ n_ext_cols = rec_offs_n_extern(offsets);
+ if (n_ext_cols) {
+ ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols);
+ }
+
+ for (i = j = 0; i < n_fields; i++) {
+ dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+ ulint col_no
+ = dict_col_get_no(col);
+ dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+
+ if (ind_field->prefix_len == 0) {
+
+ const byte* field = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+
+ if (UNIV_LIKELY_NULL(col_table)) {
+ ut_a(col_no
+ < dict_table_get_n_cols(col_table));
+ col = dict_table_get_nth_col(
+ col_table, col_no);
+ }
+
+ if (col->ord_part) {
+ /* We will have to fetch prefixes of
+ externally stored columns that are
+ referenced by column prefixes. */
+ ext_cols[j++] = col_no;
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(row));
+
+ if (j) {
+ *ext = row_ext_create(j, ext_cols, row,
+ dict_table_zip_size(index->table),
+ heap);
+ } else {
+ *ext = NULL;
+ }
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(row);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ ulint i;
+ const byte* field;
+ ulint len;
+ ulint rec_len;
+
+ ut_ad(rec && heap && index);
+ /* Because this function may be invoked by row0merge.c
+ on a record whose header is in different format, the check
+ rec_offs_validate(rec, index, offsets) must be avoided here. */
+ ut_ad(n_ext);
+ *n_ext = 0;
+
+ rec_len = rec_offs_n_fields(offsets);
+
+ entry = dtuple_create(heap, rec_len);
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(rec_len == dict_index_get_n_fields(index));
+
+ dict_index_copy_types(entry, index, rec_len);
+
+ for (i = 0; i < rec_len; i++) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ (*n_ext)++;
+ }
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ ulint type, /*!< in: ROW_COPY_DATA, or
+ ROW_COPY_POINTERS: the former
+					also copies the data fields to
+					heap, whereas the latter only places
+ pointers to data fields on the
+ index page */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the dtuple is used! */
+ const dict_index_t* index, /*!< in: index */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* entry;
+ byte* buf;
+
+ ut_ad(rec && heap && index);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap);
+
+ dtuple_set_info_bits(entry,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+				the former also copies the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dtuple_t* ref;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index && rec && heap);
+ ut_ad(!dict_index_is_clust(index));
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(ref);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ trx_t* trx) /*!< in: transaction */
+{
+ const dict_index_t* clust_index;
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_a(ref);
+ ut_a(index);
+ ut_a(rec);
+ ut_ad(!dict_index_is_clust(index));
+
+ if (UNIV_UNLIKELY(!index->table)) {
+ fputs("InnoDB: table ", stderr);
+notfound:
+ ut_print_name(stderr, trx, TRUE, index->table_name);
+ fputs(" for index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ fputs(" not found\n", stderr);
+ ut_error;
+ }
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (UNIV_UNLIKELY(!clust_index)) {
+ fputs("InnoDB: clust index for table ", stderr);
+ goto notfound;
+ }
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row reference.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ return(clust_rec);
+}
+
+/***************************************************************//**
+Searches an index record.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ return(!page_rec_is_infimum(rec) && low_match == n_fields);
+}
+
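+/* A minimal usage sketch for row_search_index_entry() (illustrative
+only; "index" and "entry" are assumed to exist and error handling is
+omitted). The page latch acquired by the pcur is held until the mtr
+is committed:
+
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	if (row_search_index_entry(index, entry, BTR_SEARCH_LEAF,
+				   &pcur, &mtr)) {
+		const rec_t*	rec = btr_pcur_get_rec(&pcur);
+		... process the matching record here ...
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+*/
+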
+#include <my_sys.h>
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_INT using "prtype" and writes the result to "buf".
+If the data is in unknown format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint ret;
+
+ if (data_len <= sizeof(ullint)) {
+
+ ullint value;
+ ibool unsigned_type = prtype & DATA_UNSIGNED;
+
+ value = mach_read_int_type((const byte*) data,
+ data_len, unsigned_type);
+
+ if (unsigned_type) {
+
+ ret = ut_snprintf(buf, buf_size, "%llu",
+ value) + 1;
+ } else {
+
+ ret = ut_snprintf(buf, buf_size, "%lld",
+ (long long) value) + 1;
+ }
+
+ } else {
+
+ *format_in_hex = TRUE;
+ ret = 0;
+ }
+
+ return(ut_min(ret, buf_size));
+}
+
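+/* Note on the on-disk integer format (a sketch, confirmed by the test
+vectors under UNIV_COMPILE_TEST_FUNCS below): InnoDB stores integers
+big-endian, with the sign bit of signed types inverted so that a plain
+byte-wise comparison sorts values correctly. For a 1-byte signed
+column, mach_read_int_type() in effect computes
+
+	value = (signed char) (byte ^ 0x80);	(0x00 -> -128, 0xFF -> 127)
+*/
+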
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the
+result to "buf".
+If the data is in binary format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_str(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint charset_coll;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ /* we assume system_charset_info is UTF-8 */
+
+ charset_coll = dtype_get_charset_coll(prtype);
+
+ if (UNIV_LIKELY(dtype_is_utf8(prtype))) {
+
+ return(ut_str_sql_format(data, data_len, buf, buf_size));
+ }
+ /* else */
+
+ if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) {
+
+ *format_in_hex = TRUE;
+ return(0);
+ }
+ /* else */
+
+ return(innobase_raw_format(data, data_len, charset_coll,
+ buf, buf_size));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint ret;
+ ibool format_in_hex;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ if (data_len == UNIV_SQL_NULL) {
+
+ ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1;
+
+ return(ut_min(ret, buf_size));
+ }
+
+ mtype = dict_field->col->mtype;
+ prtype = dict_field->col->prtype;
+
+ format_in_hex = FALSE;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ret = row_raw_format_int(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ break;
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+
+ ret = row_raw_format_str(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ break;
+ /* XXX support more data types */
+ default:
+
+ format_in_hex = TRUE;
+ }
+
+ if (format_in_hex) {
+
+ if (UNIV_LIKELY(buf_size > 2)) {
+
+ memcpy(buf, "0x", 2);
+ buf += 2;
+ buf_size -= 2;
+ ret = 2 + ut_raw_to_hex(data, data_len,
+ buf, buf_size);
+ } else {
+
+ buf[0] = '\0';
+ ret = 1;
+ }
+ }
+
+ return(ret);
+}
+
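+/* A minimal usage sketch for row_raw_format() (illustrative only;
+"data", "data_len" and "dict_field" are assumed to come from an index
+record and its data dictionary metadata):
+
+	char	buf[128];
+	ulint	n;
+
+	n = row_raw_format(data, data_len, dict_field, buf, sizeof(buf));
+
+buf now holds a NUL-terminated string of n bytes, e.g. "42" for a
+DATA_INT column, or "0x..." when the value had to be dumped in hex. */
+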
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include "ut0dbg.h"
+
+void
+test_row_raw_format_int()
+{
+ ulint ret;
+ char buf[128];
+ ibool format_in_hex;
+
+#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
+ ret_expected, buf_expected, format_in_hex_expected)\
+ do {\
+ ibool ok = TRUE;\
+ ulint i;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ format_in_hex = FALSE;\
+ fprintf(stderr, "TESTING \"\\x");\
+ for (i = 0; i < data_len; i++) {\
+ fprintf(stderr, "%02hhX", data[i]);\
+ }\
+ fprintf(stderr, "\", %lu, %lu, %lu\n",\
+ (ulint) data_len, (ulint) prtype,\
+ (ulint) buf_size);\
+ ret = row_raw_format_int(data, data_len, prtype,\
+ buf, buf_size, &format_in_hex);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (format_in_hex != format_in_hex_expected) {\
+ fprintf(stderr, "expected format_in_hex %d, got %d\n",\
+ (int) format_in_hex_expected,\
+ (int) format_in_hex);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\
+ (ulint) ret, buf, (int) format_in_hex);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+#if 1
+ /* min values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, 0,
+ buf, sizeof(buf), 5, "-128", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, 0,
+ buf, sizeof(buf), 7, "-32768", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, 0,
+ buf, sizeof(buf), 9, "-8388608", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, 0,
+ buf, sizeof(buf), 12, "-2147483648", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0,
+ buf, sizeof(buf), 14, "-549755813888", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0,
+ buf, sizeof(buf), 17, "-140737488355328", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0,
+ buf, sizeof(buf), 19, "-36028797018963968", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0,
+ buf, sizeof(buf), 21, "-9223372036854775808", 0);
+
+ /* min values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ /* max values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, 0,
+ buf, sizeof(buf), 4, "127", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, 0,
+ buf, sizeof(buf), 6, "32767", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, 0,
+ buf, sizeof(buf), 8, "8388607", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0,
+ buf, sizeof(buf), 11, "2147483647", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0,
+ buf, sizeof(buf), 13, "549755813887", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0,
+ buf, sizeof(buf), 16, "140737488355327", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0,
+ buf, sizeof(buf), 18, "36028797018963967", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0,
+ buf, sizeof(buf), 20, "9223372036854775807", 0);
+
+ /* max values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 4, "255", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "65535", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 9, "16777215", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 11, "4294967295", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 14, "1099511627775", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 16, "281474976710655", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 18, "72057594037927935", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 21, "18446744073709551615", 0);
+
+ /* some random values */
+
+ CALL_AND_TEST("\x52", 1, 0,
+ buf, sizeof(buf), 4, "-46", 0);
+
+ CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "14", 0);
+
+ CALL_AND_TEST("\x62\xCE", 2, 0,
+ buf, sizeof(buf), 6, "-7474", 0);
+
+ CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "10710", 0);
+
+ CALL_AND_TEST("\x7F\xFF\x90", 3, 0,
+ buf, sizeof(buf), 5, "-112", 0);
+
+ CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "41238", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0,
+ buf, sizeof(buf), 3, "-9", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "92", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0,
+ buf, sizeof(buf), 6, "-9117", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "91234", 0);
+#endif
+
+ /* speed test */
+
+ speedo_t speedo;
+ ulint i;
+
+ speedo_reset(&speedo);
+
+ for (i = 0; i < 1000000; i++) {
+ row_raw_format_int("\x23", 1,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x23", 1,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+ }
+
+ speedo_show(&speedo);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.c
new file mode 100644
index 00000000000..3ef9726588e
--- /dev/null
+++ b/storage/innobase/row/row0sel.c
@@ -0,0 +1,4736 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************//**
+@file row/row0sel.c
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "read0read.h"
+#include "buf0lru.h"
+#include "ha_prototypes.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
+
+/********************************************************************//**
+Returns TRUE if the user-defined column in a secondary index record
+is alphabetically the same as the corresponding BLOB column in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the columns are equal */
+static
+ibool
+row_sel_sec_rec_is_for_blob(
+/*========================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of a
+ multi-byte character */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multi-byte character */
+ const byte* clust_field, /*!< in: the locally stored part of
+ the clustered index column, including
+ the BLOB pointer; the clustered
+ index record must be covered by
+ a lock or a page latch to protect it
+ against deletion (rollback or purge) */
+ ulint clust_len, /*!< in: length of clust_field */
+ const byte* sec_field, /*!< in: column in secondary index */
+ ulint sec_len, /*!< in: length of sec_field */
+ ulint zip_size) /*!< in: compressed page size, or 0 */
+{
+ ulint len;
+ byte buf[DICT_MAX_INDEX_COL_LEN];
+
+ len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
+ zip_size,
+ clust_field, clust_len);
+
+ if (UNIV_UNLIKELY(len == 0)) {
+ /* The BLOB was being deleted as the server crashed.
+ There should not be any secondary index records
+ referring to this clustered index record, because
+ btr_free_externally_stored_field() is called after all
+ secondary index entries of the row have been purged. */
+ return(FALSE);
+ }
+
+ len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
+ sec_len, len, (const char*) buf);
+
+ return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
+}
+
+/********************************************************************//**
+Returns TRUE if the user-defined column values in a secondary index record
+are alphabetically the same as the corresponding columns in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the secondary record is equal to the corresponding
+fields in the clustered record, when compared with collation */
+static
+ibool
+row_sel_sec_rec_is_for_clust_rec(
+/*=============================*/
+ const rec_t* sec_rec, /*!< in: secondary index record */
+ dict_index_t* sec_index, /*!< in: secondary index */
+ const rec_t* clust_rec, /*!< in: clustered index record;
+ must be protected by a lock or
+ a page latch against deletion
+ in rollback or purge */
+ dict_index_t* clust_index) /*!< in: clustered index */
+{
+ const byte* sec_field;
+ ulint sec_len;
+ const byte* clust_field;
+ ulint n;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
+ ulint* clust_offs = clust_offsets_;
+ ulint* sec_offs = sec_offsets_;
+ ibool is_equal = TRUE;
+
+ rec_offs_init(clust_offsets_);
+ rec_offs_init(sec_offsets_);
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(clust_index->table))) {
+
+ /* The clustered index record is delete-marked;
+ it is not visible in the read view. Besides,
+ if there are any externally stored columns,
+ some of them may have already been purged. */
+ return(FALSE);
+ }
+
+ clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+ ULINT_UNDEFINED, &heap);
+ sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+ ULINT_UNDEFINED, &heap);
+
+ n = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+ for (i = 0; i < n; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint clust_pos;
+ ulint clust_len;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(sec_index, i);
+ col = dict_field_get_col(ifield);
+ clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+ clust_field = rec_get_nth_field(
+ clust_rec, clust_offs, clust_pos, &clust_len);
+ sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+ len = clust_len;
+
+ if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {
+
+ if (rec_offs_nth_extern(clust_offs, clust_pos)) {
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ifield->prefix_len, len, (char*) clust_field);
+
+ if (rec_offs_nth_extern(clust_offs, clust_pos)
+ && len < sec_len) {
+ if (!row_sel_sec_rec_is_for_blob(
+ col->mtype, col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ clust_field, clust_len,
+ sec_field, sec_len,
+ dict_table_zip_size(
+ clust_index->table))) {
+ goto inequal;
+ }
+
+ continue;
+ }
+ }
+
+ if (0 != cmp_data_data(col->mtype, col->prtype,
+ clust_field, len,
+ sec_field, sec_len)) {
+inequal:
+ is_equal = FALSE;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(is_equal);
+}
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(sel_node_t));
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /*!< in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*********************************************************************//**
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /*!< in: first variable in a list of variables */
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ exp = node->select_list;
+
+ while (var) {
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ var = que_node_get_next(var);
+ }
+}
+
+/*********************************************************************//**
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ func_node = node->select_list;
+
+ while (func_node) {
+ eval_node_set_int_val(func_node, 0);
+
+ func_node = que_node_get_next(func_node);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*********************************************************************//**
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*********************************************************************//**
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /*!< in: record index */
+ const rec_t* rec, /*!< in: record in a clustered or non-clustered
+ index; must be protected by a page latch */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ mem_heap_t* heap = NULL;
+ ibool needs_copy;
+
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+ field_no))) {
+
+ /* Copy an externally stored field to the
+ temporary heap */
+
+ heap = mem_heap_create(1);
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ dict_table_zip_size(index->table),
+ field_no, &len, heap);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ needs_copy = TRUE;
+ } else {
+ data = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+
+ needs_copy = column->copy_val;
+ }
+
+ if (needs_copy) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Allocates a prefetch buffer for a column the first time prefetch is done. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /*!< in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+ * sizeof(sel_buf_t));
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ mem_free(sel_buf->data);
+ }
+ }
+}
+
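+/* Note that sel_col_prefetch_buf_free() frees only the data buffers
+inside the array, not the array itself, which was allocated with
+mem_alloc() in sel_col_prefetch_buf_alloc(). A teardown sketch for a
+(hypothetical) owner of a column symbol node:
+
+	if (column->prefetch_buf) {
+		sel_col_prefetch_buf_free(column->prefetch_buf);
+		mem_free(column->prefetch_buf);
+	}
+*/
+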
+/*********************************************************************//**
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them in the val fields of the column nodes. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+ ut_d(dfield_set_null(val));
+
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+ ut_ad(!dfield_is_ext(val));
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = dfield_get_data(val);
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*********************************************************************//**
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ if (!column->copy_val) {
+ /* It makes no sense to push pointers to database
+ page fields when we do not keep a latch on the page! */
+
+ goto next_col;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = dfield_get_data(val);
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+ read_view_t* read_view, /*!< in: read view */
+ dict_index_t* index, /*!< in: clustered index of rec */
+ rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, plan->index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+
+ if (*old_vers_heap) {
+ mem_heap_empty(*old_vers_heap);
+ } else {
+ *old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, index, offsets, read_view, offset_heap,
+ *old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ const rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_semi_consistent_read(
+ rec, mtr, clust_index, offsets, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(cond->args);
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Tests the other conditions.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ plan_t* plan, /*!< in: plan node for table */
+ rec_t* rec, /*!< in: record in a non-clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a freshly
+ inserted version */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ *out_rec = NULL;
+
+ offsets = rec_get_offsets(rec,
+ btr_pcur_get_btr_cur(&plan->pcur)->index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &plan->clust_pcur,
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&(plan->clust_pcur))
+ < dict_index_get_n_unique(index)) {
+
+ ut_a(rec_get_deleted_flag(rec,
+ dict_table_is_comp(plan->table)));
+ ut_a(node->read_view);
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.c
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ goto func_exit;
+ }
+
+ offsets = rec_get_offsets(clust_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation level
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ ulint lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(&plan->clust_pcur),
+ clust_rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, clust_rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ goto func_exit;
+ }
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query that fetches all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we did not drop rows that we come to
+ visit through secondary index records that do not really
+ exist in our snapshot. */
+
+ if ((old_vers
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ plan->table)))
+ && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
+ clust_rec, index)) {
+ goto func_exit;
+ }
+ }
+
+ /* Fetch the columns needed in test conditions. The clustered
+ index record is protected by a page latch that was acquired
+ when plan->clust_pcur was positioned. The latch will not be
+ released until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, clust_rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+func_exit:
+ err = DB_SUCCESS;
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a lock on a record.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+ulint
+sel_set_rec_lock(
+/*=============*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint mode, /*!< in: lock mode */
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ ulint err;
+
+ trx = thr_get_trx(thr);
+
+ if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, mode, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, mode, type, thr);
+ }
+
+ return(err);
+}
+
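+/* Callers of sel_set_rec_lock() below choose between next-key and
+record-only locking with a common pattern (sketch; "trx" is the
+transaction of the query thread):
+
+	if (srv_locks_unsafe_for_binlog
+	    || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+		lock_type = LOCK_REC_NOT_GAP;	(record-only lock)
+	} else {
+		lock_type = LOCK_ORDINARY;	(next-key lock)
+	}
+*/
+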
+/*********************************************************************//**
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ plan_t* plan, /*!< in: table plan */
+ ibool search_latch_locked,
+ /*!< in: TRUE if the thread currently
+ has the search latch locked in
+ s-mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
+ ulint i;
+
+ if (search_latch_locked) {
+ has_search_latch = RW_S_LATCH;
+ }
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ BTR_SEARCH_LEAF, &plan->pcur,
+ has_search_latch, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
+ &(plan->pcur), FALSE, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index.
+@return TRUE if the cursor should be moved to the next record after we
+return from this function (moved to the previous, in the case of a
+descending cursor) without processing again the current cursor
+record */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(TRUE);
+}
+
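+/* The case analysis in row_sel_restore_pcur_pos() above, condensed
+(return value = "move to the next/previous record first?"):
+
+	relative_position	ascending	descending
+	BTR_PCUR_BEFORE		(not allowed)	FALSE
+	BTR_PCUR_AFTER		FALSE		TRUE
+	BTR_PCUR_ON, !equal	TRUE		FALSE
+	BTR_PCUR_ON, equal	plan->stored_cursor_rec_processed (both)
+*/
+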
+/*********************************************************************//**
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /*!< in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always).
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ sel_node_t* node, /*!< in: select node for a consistent read */
+ plan_t* plan, /*!< in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ulint ret;
+ rec_offs_init(offsets_);
+
+ index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ row_sel_open_pcur(plan, TRUE, mtr);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (dict_index_is_clust(index)) {
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
+
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+
+ /* Test the deleted flag. */
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ /* Fetch the columns needed in test conditions. The index
+ record is protected by a page latch that was acquired when
+ plan->pcur was positioned. The latch will not be released
+ until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ plan->n_rows_fetched++;
+ ret = SEL_FOUND;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/*********************************************************************//**
+Performs a select step.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel(
+/*====*/
+ sel_node_t* node, /*!< in: select node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool search_latch_locked;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ ulint found_flag;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(thr->run_node == node);
+
+ search_latch_locked = FALSE;
+
+ if (node->read_view) {
+ /* In consistent reads, we try to make do with the hash
+ index and avoid the buffer page get. This is to reduce memory bus
+ load resulting from semaphore operations. The search latch
+ will be s-locked when we access an index with a unique search
+ condition, but not locked when we access an index with a
+ less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+ /* The cursor has already reached the result set end: no more
+ rows to process for this table cursor, as also the prefetch
+ stack was empty */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr_start(&mtr);
+
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust
+ && !plan->table->big_rows) {
+ if (!search_latch_locked) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+ } else if (rw_lock_get_writer(&btr_search_latch)
+ == RW_LOCK_WAIT_EX) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+ kept for some 10 searches before being released,
+ a waiting x-latch request would block other threads
+ from acquiring an s-latch for a long time, lowering
+ performance significantly in multiprocessors. */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+ found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+ if (found_flag == SEL_FOUND) {
+
+ goto next_table;
+
+ } else if (found_flag == SEL_EXHAUSTED) {
+
+ goto table_exhausted;
+ }
+
+ ut_ad(found_flag == SEL_RETRY);
+
+ plan_reset_cursor(plan);
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ search_latch_locked = FALSE;
+ }
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+
+ row_sel_open_pcur(plan, search_latch_locked, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && !page_rec_is_supremum(rec)) {
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation
+ level, we lock only the record, i.e., next-key
+ locking is not used. */
+
+ rec_t* next_rec = page_rec_get_next(rec);
+ ulint lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ == TRX_ISO_READ_COMMITTED) {
+
+ if (page_rec_is_supremum(next_rec)) {
+
+ goto skip_lock;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+ next_rec, index, offsets,
+ node->row_lock_mode,
+ lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+skip_lock:
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+ processed yet another page of index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation level,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+
+ ulint lock_type;
+ trx_t* trx;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ trx = thr_get_trx(thr);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+
+ if (page_rec_is_supremum(rec)) {
+
+ goto next_rec;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+ rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+		stored the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+	/* PHASE 2: Check the mix id of a mixed index, if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+ tells how many fields in the user record matched to the search
+ tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ cons_read_requires_clust_rec = FALSE;
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (dict_index_is_clust(index)) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* Fetch the columns needed in
+ test conditions. The clustered
+ index record is protected by a
+ page latch that was acquired
+ by row_sel_open_pcur() or
+ row_sel_restore_pcur_pos().
+ The latch will not be released
+ until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(
+ index, rec, offsets,
+ UT_LIST_GET_FIRST(
+ plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec,
+ node->read_view)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions. The record is
+ protected by a page latch that was acquired by
+ row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
+ will not be released until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(plan->table))) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch
+ || plan->table->big_rows) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_push_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ ut_ad(!search_latch_locked);
+
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+ err = DB_SUCCESS;
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+ } else {
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ goto func_exit;
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+ err = DB_SUCCESS;
+ goto func_exit;
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+	&mtr were not committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ ut_ad(!search_latch_locked);
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+func_exit:
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint i_lock_mode;
+ sym_node_t* table_node;
+ sel_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or when
+	execution resumes after a wait for a table intention lock), set
+	intention locks on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ /* Assign a read view for the query */
+ node->read_view = trx_assign_read_view(
+ thr_get_trx(thr));
+ } else {
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ table_node = node->table_list;
+
+ while (table_node) {
+ err = lock_table(0, table_node->table,
+ i_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ table_node = que_node_get_next(table_node);
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor
+ && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ if (node->into_list) {
+ sel_assign_into_var_values(node->into_list,
+ sel_node);
+ } else {
+ void* ret = (*node->func->func)(
+ sel_node, node->func->arg);
+
+ if (!ret) {
+ sel_node->state
+ = SEL_NODE_NO_MORE_ROWS;
+ }
+ }
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ /* Make the fetch node the parent of the cursor definition for
+ the time of the fetch, so that execution knows to return to this
+ fetch node after a row has been selected or we know that there is
+ no row left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ fprintf(stderr,
+ "InnoDB: Error: fetch called on a closed cursor\n");
+
+ thr_get_trx(thr)->error_state = DB_ERROR;
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: not used */
+{
+ sel_node_t* node = row;
+ que_node_t* exp;
+ ulint i = 0;
+
+ UT_NOT_USED(user_arg);
+
+ fprintf(stderr, "row_fetch_print: row %p\n", row);
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ const dtype_t* type = dfield_get_type(dfield);
+
+ fprintf(stderr, " column %lu:\n", (ulong)i);
+
+ dtype_print(type);
+ putc('\n', stderr);
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
+ ut_print_buf(stderr, dfield_get_data(dfield),
+ dfield_get_len(dfield));
+ putc('\n', stderr);
+ } else {
+ fputs(" <NULL>;\n", stderr);
+ }
+
+ exp = que_node_get_next(exp);
+ i++;
+ }
+
+ return((void*)42);
+}
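+
+/* Illustration (not from the original sources): a minimal sketch of a
+user-supplied fetch callback, showing the contract that fetch_step()
+relies on: the callback runs once per fetched row, fetching continues
+while it returns non-NULL, and returning NULL puts the cursor in the
+SEL_NODE_NO_MORE_ROWS state. The name row_fetch_count_rows and the use
+of user_arg as a ulint counter are hypothetical. */
+#if 0 /* illustration only */
+static
+void*
+row_fetch_count_rows(
+/*=================*/
+	void*	row,		/*!< in: sel_node_t*; unused here */
+	void*	user_arg)	/*!< in: pointer to a ulint row counter */
+{
+	ulint*	n_rows = user_arg;
+
+	UT_NOT_USED(row);
+
+	(*n_rows)++;
+
+	/* Keep fetching until ten rows have been counted: returning
+	NULL stops the fetch loop. */
+	return(*n_rows < 10 ? user_arg : NULL);
+}
+#endif /* illustration only */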
+
+/****************************************************************//**
+Callback function for fetch that stores an unsigned 4 byte integer to the
+location pointed to by user_arg. The column's type must be DATA_INT,
+DATA_UNSIGNED, with length 4.
+@return always returns NULL */
+UNIV_INTERN
+void*
+row_fetch_store_uint4(
+/*==================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: data pointer */
+{
+ sel_node_t* node = row;
+ ib_uint32_t* val = user_arg;
+ ulint tmp;
+
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ const dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(dtype_get_prtype(type) & DATA_UNSIGNED);
+ ut_a(len == 4);
+
+ tmp = mach_read_from_4(dfield_get_data(dfield));
+ *val = (ib_uint32_t) tmp;
+
+ return(NULL);
+}
+
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ fputs(" ::: ", stderr);
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field, and we print a warning
+if such a key appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len, /*!< in: MySQL key value length */
+ trx_t* trx) /*!< in: transaction */
+{
+ byte* original_buf = buf;
+ const byte* original_key_ptr = key_ptr;
+ dict_field_t* field;
+ dfield_t* dfield;
+ ulint data_offset;
+ ulint data_len;
+ ulint data_field_len;
+ ibool is_null;
+ const byte* key_end;
+ ulint n_fields = 0;
+
+ /* For documentation of the key value storage format in MySQL, see
+ ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ field = dict_index_get_nth_field(index, 0);
+
+ if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
+ /* A special case: we are looking for a position in the
+ generated clustered index which InnoDB automatically added
+ to a table with no primary key: the first and the only
+ ordering column is ROW_ID which InnoDB stored to the key_ptr
+ buffer. */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+
+ ulint type = dfield_get_type(dfield)->mtype;
+ ut_a(field->col->mtype == type);
+
+ data_offset = 0;
+ is_null = FALSE;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ data_offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_null(dfield);
+
+ is_null = TRUE;
+ }
+ }
+
+ /* Calculate data length and data field total length */
+
+ if (type == DATA_BLOB) {
+ /* The key field is a column prefix of a BLOB or
+ TEXT */
+
+ ut_a(field->prefix_len > 0);
+
+ /* MySQL stores the actual data length to the first 2
+ bytes after the optional SQL NULL marker byte. The
+ storage format is little-endian, that is, the most
+ significant byte at a higher address. In UTF-8, MySQL
+ seems to reserve field->prefix_len bytes for
+ storing this field in the key value buffer, even
+ though the actual value only takes data_len bytes
+ from the start. */
+
+ data_len = key_ptr[data_offset]
+ + 256 * key_ptr[data_offset + 1];
+ data_field_len = data_offset + 2 + field->prefix_len;
+
+ data_offset += 2;
+
+ /* Now that we know the length, we store the column
+ value like it would be a fixed char field */
+
+ } else if (field->prefix_len > 0) {
+ /* Looks like MySQL pads unused end bytes in the
+ prefix with space. Therefore, also in UTF-8, it is ok
+ to compare with a prefix containing full prefix_len
+ bytes, and no need to take at most prefix_len / 3
+ UTF-8 characters from the start.
+ If the prefix is used as the upper end of a LIKE
+ 'abc%' query, then MySQL pads the end with chars
+ 0xff. TODO: in that case does it any harm to compare
+ with the full prefix_len bytes. How do characters
+ 0xff in UTF-8 behave? */
+
+ data_len = field->prefix_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ data_len = dfield_get_type(dfield)->len;
+ data_field_len = data_offset + data_len;
+ }
+
+ if (UNIV_UNLIKELY
+ (dtype_get_mysql_type(dfield_get_type(dfield))
+ == DATA_MYSQL_TRUE_VARCHAR)
+ && UNIV_LIKELY(type != DATA_INT)) {
+ /* In a MySQL key value format, a true VARCHAR is
+ always preceded by 2 bytes of a length field.
+ dfield_get_type(dfield)->len returns the maximum
+ 'payload' len in bytes. That does not include the
+ 2 bytes that tell the actual data length.
+
+ We added the check != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR! */
+
+ data_len += 2;
+ data_field_len += 2;
+ }
+
+ /* Storing may use at most data_len bytes of buf */
+
+ if (UNIV_LIKELY(!is_null)) {
+ row_mysql_store_col_in_innobase_format(
+ dfield, buf,
+ FALSE, /* MySQL key value format col */
+ key_ptr + data_offset, data_len,
+ dict_table_is_comp(index->table));
+ buf += data_len;
+ }
+
+ key_ptr += data_field_len;
+
+ if (UNIV_UNLIKELY(key_ptr > key_end)) {
+ /* The last field in key was not a complete key field
+ but a prefix of it.
+
+ Print a warning about this! HA_READ_PREFIX_LAST does
+ not currently work in InnoDB with partial-field key
+ value prefixes. Since MySQL currently uses a padding
+			trick to calculate LIKE 'abc%' type queries, there
+ should never be partial-field prefixes in searches. */
+
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Warning: using a partial-field"
+ " key prefix in search.\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, ". Last data field length %lu bytes,\n"
+ "InnoDB: key ptr now exceeds"
+ " key end by %lu bytes.\n"
+ "InnoDB: Key value in the MySQL format:\n",
+ (ulong) data_field_len,
+ (ulong) (key_ptr - key_end));
+ fflush(stderr);
+ ut_print_buf(stderr, original_key_ptr, key_len);
+ putc('\n', stderr);
+
+ if (!is_null) {
+ ulint len = dfield_get_len(dfield);
+ dfield_set_len(dfield, len
+ - (ulint) (key_ptr - key_end));
+ }
+ }
+
+ n_fields++;
+ field++;
+ dfield++;
+ }
+
+ ut_a(buf <= original_buf + buf_len);
+
+ /* We set the length of tuple to n_fields: we assume that the memory
+ area allocated for it is big enough (usually bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
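+
+/* Illustration (not from the original sources): a worked example of the
+MySQL key value format parsed by the loop above. For a nullable true
+VARCHAR(10) key part holding 'abc', assuming a single-byte charset, the
+bytes in the key buffer are (offsets relative to the key part start):
+
+	byte 0:       0x00         SQL NULL marker (nonzero = SQL NULL)
+	bytes 1-2:    0x03 0x00    actual data length 3, little-endian
+	bytes 3-12:   'a' 'b' 'c'  payload, padded out to the maximum
+	                           length of 10 bytes
+
+so data_offset = 1, and the DATA_MYSQL_TRUE_VARCHAR branch makes
+data_len = 10 + 2 = 12 (the length bytes plus the maximum payload) and
+data_field_len = 1 + 2 + 10 = 13, which is how far key_ptr advances. */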
+
+/**************************************************************//**
+Stores the row id to the prebuilt struct. */
+static
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
+ const rec_t* index_rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const ulint* offsets) /*!< in: rec_get_offsets
+ (index_rec, index) */
+{
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(index_rec, index, offsets));
+
+ data = rec_get_nth_field(
+ index_rec, offsets,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+
+ if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
+ fprintf(stderr,
+ "InnoDB: Error: Row id field is"
+ " wrong length %lu in ", (ulong) len);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: Field number %lu, record:\n",
+ (ulong) dict_index_get_sys_col_pos(index,
+ DATA_ROW_ID));
+ rec_print_new(stderr, index_rec, offsets);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ ut_memcpy(prebuilt->row_id, data, len);
+}
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
+static
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+ byte* dest, /*!< in/out: buffer where to store; NOTE
+ that BLOBs are not in themselves
+ stored here: the caller must allocate
+ and copy the BLOB into buffer before,
+ and pass the pointer to the BLOB in
+ 'data' */
+ const mysql_row_templ_t* templ,
+ /*!< in: MySQL column template.
+ Its following fields are referenced:
+ type, is_unsigned, mysql_col_len,
+ mbminlen, mbmaxlen */
+ const byte* data, /*!< in: data to store */
+ ulint len) /*!< in: length of the data */
+{
+ byte* ptr;
+ byte* field_end;
+ byte* pad_ptr;
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ switch (templ->type) {
+ case DATA_INT:
+ /* Convert integer data from Innobase to a little-endian
+ format, sign bit restored to normal */
+
+ ptr = dest + len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == dest) {
+ break;
+ }
+ data++;
+ }
+
+ if (!templ->is_unsigned) {
+ dest[len - 1] = (byte) (dest[len - 1] ^ 128);
+ }
+
+ ut_ad(templ->mysql_col_len == len);
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field_end = dest + templ->mysql_col_len;
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, templ->mysql_length_bytes);
+ }
+
+ /* Copy the actual data */
+ ut_memcpy(dest, data, len);
+
+ /* Pad with trailing spaces. We pad with spaces also the
+ unused end of a >= 5.0.3 true VARCHAR column, just in case
+ MySQL expects its contents to be deterministic. */
+
+ pad_ptr = dest + len;
+
+ ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+ /* We handle UCS2 charset strings differently. */
+ if (templ->mbminlen == 2) {
+ /* A space char is two bytes, 0x0020 in UCS2 */
+
+ if (len & 1) {
+ /* A 0x20 has been stripped from the column.
+ Pad it back. */
+
+ if (pad_ptr < field_end) {
+ *pad_ptr = 0x20;
+ pad_ptr++;
+ }
+ }
+
+ /* Pad the rest of the string with 0x0020 */
+
+ while (pad_ptr < field_end) {
+ *pad_ptr = 0x00;
+ pad_ptr++;
+ *pad_ptr = 0x20;
+ pad_ptr++;
+ }
+ } else {
+ ut_ad(templ->mbminlen == 1);
+ /* space=0x20 */
+
+ memset(pad_ptr, 0x20, field_end - pad_ptr);
+ }
+ break;
+
+ case DATA_BLOB:
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+ len);
+ break;
+
+ case DATA_MYSQL:
+ memcpy(dest, data, len);
+
+ ut_ad(templ->mysql_col_len >= len);
+ ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+ ut_ad(templ->mbmaxlen > templ->mbminlen
+ || templ->mysql_col_len == len);
+ /* The following assertion would fail for old tables
+ containing UTF-8 ENUM columns due to Bug #9526. */
+ ut_ad(!templ->mbmaxlen
+ || !(templ->mysql_col_len % templ->mbmaxlen));
+ ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
+
+ if (templ->mbminlen != templ->mbmaxlen) {
+ /* Pad with spaces. This undoes the stripping
+ done in row0mysql.ic, function
+ row_mysql_store_col_in_innobase_format(). */
+
+ memset(dest + len, 0x20, templ->mysql_col_len - len);
+ }
+ break;
+
+ default:
+#ifdef UNIV_DEBUG
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Above are the valid column types for MySQL data. */
+#endif /* UNIV_DEBUG */
+ ut_ad(templ->mysql_col_len == len);
+ memcpy(dest, data, len);
+ }
+}
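+
+/* Illustration (not from the original sources): a worked example of the
+DATA_INT branch above for a signed 4-byte column. InnoDB stores integers
+big-endian with the sign bit flipped, so the value -2 is stored as
+7F FF FF FE. The copy loop reverses the bytes to FE FF FF 7F, and the
+final XOR of the top byte (dest[len - 1] ^ 128) restores the sign bit,
+yielding FE FF FF FF: the little-endian two's complement encoding of -2
+that MySQL expects. For an unsigned column the XOR step is skipped. */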
+
+/**************************************************************//**
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec; the other columns are left blank. Not all columns
+may be needed in the query.
+@return TRUE if success, FALSE if could not allocate memory for a BLOB
+(though we may also assert in that case) */
+static
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+ byte* mysql_rec, /*!< out: row in the MySQL format */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: Innobase record in the index
+ which was described in prebuilt's
+ template; must be protected by
+ a page latch */
+ const ulint* offsets) /*!< in: array returned by
+ rec_get_offsets() */
+{
+ mysql_row_templ_t* templ;
+ mem_heap_t* extern_field_heap = NULL;
+ mem_heap_t* heap;
+ const byte* data;
+ ulint len;
+ ulint i;
+
+ ut_ad(prebuilt->mysql_template);
+ ut_ad(prebuilt->default_rec);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ }
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+ templ->rec_field_no))) {
+
+ /* Copy an externally stored field to the temporary
+ heap */
+
+ ut_a(!prebuilt->trx->has_search_latch);
+
+ if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ UNIV_PAGE_SIZE);
+ }
+
+ heap = prebuilt->blob_heap;
+ } else {
+ extern_field_heap
+ = mem_heap_create(UNIV_PAGE_SIZE);
+
+ heap = extern_field_heap;
+ }
+
+ /* NOTE: if we are retrieving a big BLOB, we may
+ already run out of memory in the next call, which
+ causes an assert */
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ dict_table_zip_size(prebuilt->table),
+ templ->rec_field_no, &len, heap);
+
+ ut_a(len != UNIV_SQL_NULL);
+ } else {
+ /* Field is stored in the row. */
+
+ data = rec_get_nth_field(rec, offsets,
+ templ->rec_field_no, &len);
+
+ if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
+ && len != UNIV_SQL_NULL) {
+
+ /* It is a BLOB field locally stored in the
+ InnoDB record: we MUST copy its contents to
+ prebuilt->blob_heap here because later code
+ assumes all BLOB values have been copied to a
+ safe place. */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ UNIV_PAGE_SIZE);
+ }
+
+ data = memcpy(mem_heap_alloc(
+ prebuilt->blob_heap, len),
+ data, len);
+ }
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, data, len);
+
+ /* Cleanup */
+ if (extern_field_heap) {
+ mem_heap_free(extern_field_heap);
+ extern_field_heap = NULL;
+ }
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset]
+ &= ~(byte) templ->mysql_null_bit_mask;
+ }
+ } else {
+ /* MySQL assumes that the field for an SQL
+ NULL value is set to the default value. */
+
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+ memcpy(mysql_rec + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ }
+ }
+
+ return(TRUE);
+}
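+
+/* Illustration (not from the original sources): the NULL bitmap handling
+above in miniature. For a hypothetical nullable column with
+mysql_null_byte_offset == 0 and mysql_null_bit_mask == 0x04, and a null
+byte currently holding 0x01:
+
+	SQL NULL value:  mysql_rec[0] |= 0x04;          byte becomes 0x05
+	non-NULL value:  mysql_rec[0] &= ~(byte) 0x04;  byte becomes 0x01
+
+Each nullable column owns one bit in the row's null bitmap, so the bits
+of other columns sharing the byte are left untouched. */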
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ read_view_t* read_view, /*!< in: read view */
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, clust_index, offsets, read_view, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/*!< in: secondary index where rec resides */
+ const rec_t* rec, /*!< in: record in a non-clustered index; if
+ this is a locking read, then rec is not
+ allowed to be delete-marked, and that would
+ not make sense either */
+ que_thr_t* thr, /*!< in: query thread */
+ const rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+					in the read view, i.e., it was a freshly
+ inserted version */
+ ulint** offsets,/*!< in: offsets returned by
+ rec_get_offsets(rec, sec_index);
+ out: offsets returned by
+ rec_get_offsets(out_rec, clust_index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ const rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+ trx = thr_get_trx(thr);
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
+ sec_index, *offsets, trx);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ prebuilt->clust_pcur->trx_if_known = trx;
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(prebuilt->clust_pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.c
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ || prebuilt->select_lock_type != LOCK_NONE) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: error clustered record"
+ " for sec rec not found\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, sec_index);
+ fputs("\n"
+ "InnoDB: sec index record ", stderr);
+ rec_print(stderr, rec, sec_index);
+ fputs("\n"
+ "InnoDB: clust index record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 600);
+
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ }
+
+ clust_rec = NULL;
+
+ goto func_exit;
+ }
+
+ *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(prebuilt->clust_pcur),
+ clust_rec, clust_index, *offsets,
+ prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ /* If the isolation level allows reading of uncommitted data,
+ then we never look for an earlier version */
+
+ if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && !lock_clust_rec_cons_read_sees(
+ clust_rec, clust_index, *offsets,
+ trx->read_view)) {
+
+ /* The following call returns 'offsets' associated with
+ 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index, prebuilt,
+ clust_rec, offsets, offset_heap, &old_vers,
+ mtr);
+
+ if (err != DB_SUCCESS || old_vers == NULL) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+		(or old_vers) is not rec; in that case we must ignore
+		such a row because in our snapshot rec would not have existed.
+		Remember that from rec we cannot see directly which transaction
+		id corresponds to it: we have to go to the clustered index
+		record. A query where we want to fetch all rows where
+		the secondary index value is in some interval would return
+		a wrong result if we did not drop rows which we come to
+		visit through secondary index records that would not really
+		exist in our snapshot. */
+
+ if (clust_rec
+ && (old_vers
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))
+ && !row_sel_sec_rec_is_for_clust_rec(
+ rec, sec_index, clust_rec, clust_index)) {
+ clust_rec = NULL;
+#ifdef UNIV_SEARCH_DEBUG
+ } else {
+ ut_a(clust_rec == NULL
+ || row_sel_sec_rec_is_for_clust_rec(
+ rec, sec_index, clust_rec, clust_index));
+#endif
+ }
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ err = DB_SUCCESS;
+err_exit:
+ return(err);
+}
+
+/********************************************************************//**
+Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down.
+@return TRUE if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+ ibool* same_user_rec, /*!< out: TRUE if we were able to restore
+ the cursor on a user record with the
+					same ordering prefix in the
+ B-tree index */
+ ulint latch_mode, /*!< in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /*!< in: cursor whose position
+ has been stored */
+ ibool moves_up, /*!< in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+ ulint relative_position;
+
+ relative_position = pcur->rel_pos;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ *same_user_rec = success;
+
+ if (relative_position == BTR_PCUR_ON) {
+ if (success) {
+ return(FALSE);
+ }
+
+ if (moves_up) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ if (relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
+
+ if (moves_up) {
+ return(TRUE);
+ }
+
+ if (btr_pcur_is_on_user_rec(pcur)) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
+
+ if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+}
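+
+/* Illustration (not from the original sources): a summary of
+sel_restore_position_for_mysql(), derived from the code paths above:
+
+	stored rel_pos                   outcome
+	BTR_PCUR_ON, restored on the     FALSE: the record was already
+	same user record                 processed, move on to the next
+	BTR_PCUR_ON, record gone         TRUE, after moving to the next
+	                                 record if moves_up
+	BTR_PCUR_AFTER(_LAST_IN_TREE)    TRUE, after moving to the
+	                                 previous record if !moves_up and
+	                                 the cursor is on a user record
+	BTR_PCUR_BEFORE(_FIRST_IN_TREE)  TRUE, after moving to the next
+	                                 record if moves_up and the
+	                                 cursor is on a user record */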
+
+/********************************************************************//**
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+ byte* buf, /*!< in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
+{
+ ulint i;
+ mysql_row_templ_t* templ;
+ byte* cached_rec;
+ ut_ad(prebuilt->n_fetch_cached > 0);
+ ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+
+ if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
+ /* Copy cache record field by field, don't touch fields that
+ are not covered by current key */
+ cached_rec = prebuilt->fetch_cache[
+ prebuilt->fetch_cache_first];
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+ ut_memcpy(buf + templ->mysql_col_offset,
+ cached_rec + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ /* Copy NULL bit of the current field from cached_rec
+ to buf */
+ if (templ->mysql_null_bit_mask) {
+ buf[templ->mysql_null_byte_offset]
+ ^= (buf[templ->mysql_null_byte_offset]
+ ^ cached_rec[templ->mysql_null_byte_offset])
+ & (byte)templ->mysql_null_bit_mask;
+ }
+ }
+ }
+ else {
+ ut_memcpy(buf,
+ prebuilt->fetch_cache[prebuilt->fetch_cache_first],
+ prebuilt->mysql_prefix_len);
+ }
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record to push; must
+ be protected by a page latch */
+ const ulint* offsets) /*!< in: rec_get_offsets() */
+{
+ byte* buf;
+ ulint i;
+
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_a(!prebuilt->templ_contains_blob);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+
+ /* A user has reported memory corruption in these
+			buffers on Linux. Put magic numbers there to help
+			track down a possible bug. */
+
+ buf = mem_alloc(prebuilt->mysql_row_len + 8);
+
+ prebuilt->fetch_cache[i] = buf + 4;
+
+ mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
+ mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
+ ROW_PREBUILT_FETCH_MAGIC_N);
+ }
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+
+ if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
+ prebuilt->fetch_cache[
+ prebuilt->n_fetch_cached],
+ prebuilt, rec, offsets))) {
+ ut_error;
+ }
+
+ prebuilt->n_fetch_cached++;
+}
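+
+/* Illustration (not from the original sources): the layout of each fetch
+cache buffer allocated above, with the debugging magic numbers:
+
+	buf + 0:                  ROW_PREBUILT_FETCH_MAGIC_N (4 bytes)
+	buf + 4:                  the cached row in the MySQL format
+	                          (mysql_row_len bytes); this is the
+	                          pointer kept in prebuilt->fetch_cache[i]
+	buf + 4 + mysql_row_len:  ROW_PREBUILT_FETCH_MAGIC_N (4 bytes)
+
+A sketch of the matching corruption check; the real counterpart runs
+when the prebuilt struct is freed: */
+#if 0 /* illustration only */
+	ut_a(mach_read_from_4(prebuilt->fetch_cache[i] - 4)
+	     == ROW_PREBUILT_FETCH_MAGIC_N);
+	ut_a(mach_read_from_4(prebuilt->fetch_cache[i]
+			      + prebuilt->mysql_row_len)
+	     == ROW_PREBUILT_FETCH_MAGIC_N);
+#endif /* illustration only */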
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). We assume that the search
+mode is PAGE_CUR_GE, that it is a consistent read, that there is a read
+view in trx, and that the btr search latch has been locked in S-mode.
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+ const rec_t** out_rec,/*!< out: record if found */
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
+ ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: started mtr */
+{
+ dict_index_t* index = prebuilt->index;
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ const rec_t* rec;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!prebuilt->templ_contains_blob);
+
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
+#ifndef UNIV_SEARCH_DEBUG
+ RW_S_LATCH,
+#else
+ 0,
+#endif
+ mtr);
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ *offsets = rec_get_offsets(rec, index, *offsets,
+ ULINT_UNDEFINED, heap);
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ *offsets, trx->read_view)) {
+
+ return(SEL_RETRY);
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ *out_rec = rec;
+
+ return(SEL_FOUND);
+}
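+
+/* Illustration (not from the original sources): how the caller in
+row_search_for_mysql() below reacts to these return codes. SEL_FOUND
+means the matching row can be converted to the MySQL format right away
+under the page latch; SEL_EXHAUSTED means no matching row can exist and
+DB_RECORD_NOT_FOUND is returned; SEL_RETRY means the shortcut could not
+decide, and the caller falls back to the ordinary cursor search. */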
+
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be tried on the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+ulint
+row_search_for_mysql(
+/*=================*/
+ byte* buf, /*!< in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+{
+ dict_index_t* index = prebuilt->index;
+ ibool comp = dict_table_is_comp(index->table);
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ const rec_t* rec;
+ const rec_t* result_rec;
+ const rec_t* clust_rec;
+ ulint err = DB_SUCCESS;
+ ibool unique_search = FALSE;
+ ibool unique_search_from_clust_index = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ ibool set_also_gap_locks = TRUE;
+ /* if the query is a plain locking SELECT, and the isolation level
+ is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
+ ibool did_semi_consistent_read = FALSE;
+ /* if the returned record was locked and we did a semi-consistent
+ read (fetch the newest committed version), then this is set to
+ TRUE */
+#ifdef UNIV_SEARCH_DEBUG
+ ulint cnt = 0;
+#endif /* UNIV_SEARCH_DEBUG */
+ ulint next_offs;
+ ibool same_user_rec;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ ut_ad(index && pcur && search_tuple);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you used"
+ " DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+
+ return(DB_MISSING_HISTORY);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+#if 0
+ /* August 19, 2005 by Heikki: temporarily disable this error
+ print until the cursor lock count is done correctly.
+ See bugs #12263 and #12456!*/
+
+ if (trx->n_mysql_tables_in_use == 0
+ && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
+ /* Note that if MySQL uses an InnoDB temp table that it
+ created inside LOCK TABLES, then n_mysql_tables_in_use can
+ be zero; in that case select_lock_type is set to LOCK_X in
+ ::start_stmt. */
+
+ fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
+ "InnoDB: but it has not locked"
+ " any tables in ::external_lock()!\n",
+ stderr);
+ trx_print(stderr, trx, 600);
+ fputc('\n', stderr);
+ }
+#endif
+
+#if 0
+ fprintf(stderr, "Match mode %lu\n search tuple ",
+ (ulong) match_mode);
+ dtuple_print(search_tuple);
+ fprintf(stderr, "N tables locked %lu\n",
+ (ulong) trx->mysql_n_tables_locked);
+#endif
+ /*-------------------------------------------------------------*/
+ /* PHASE 0: Release a possible s-latch we are holding on the
+ adaptive hash index latch if there is someone waiting behind */
+
+ if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
+ && trx->has_search_latch) {
+
+ /* There is an x-latch request on the adaptive hash index:
+ release the s-latch to reduce starvation and wait for
+ BTR_SEA_TIMEOUT rounds before trying to keep it again over
+ calls from MySQL */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+ }
+
+ /* Reset the new record lock info if srv_locks_unsafe_for_binlog
+	is set or the session is using a READ COMMITTED isolation level. Then
+ we are able to remove the record locks set here on an individual
+ row. */
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ trx->op_info = "starting index read";
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ trx->op_info = "fetching rows";
+
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
+ if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
+ ut_error;
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+
+ srv_n_rows_read++;
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+ /* The previous returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+ /* Prevent wrap-over */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
+ mode = pcur->search_mode;
+ }
+
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a
+ non-delete-marked matching record.
+
+ Note that in a unique secondary index there may be different
+ delete-marked versions of a record where only the primary key
+ values differ: thus in a secondary index we must use next-key
+ locks when locking delete-marked records. */
+
+ if (match_mode == ROW_SEL_EXACT
+ && dict_index_is_unique(index)
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)
+ && (dict_index_is_clust(index)
+ || !dtuple_contains_null(search_tuple))) {
+
+ /* Note above that a UNIQUE secondary index can contain many
+ rows with the same key value if one of the columns is the SQL
+ null. A clustered index under MySQL can never contain null
+ columns because we demand that all the columns in primary key
+ are non-null. */
+
+ unique_search = TRUE;
+
+ /* Even if the condition is unique, MySQL seems to try to
+ retrieve also a second row if a primary key contains more than
+ 1 column. Return immediately if this is not a HANDLER
+ command. */
+
+ if (UNIV_UNLIKELY(direction != 0
+ && !prebuilt->used_in_HANDLER)) {
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+ /* Next test if this is the special case where we can use the fast
+ adaptive hash index to try the search. Since we must release the
+ search system latch when we retrieve an externally stored field, we
+ cannot use the adaptive hash index in a search in the case the row
+ may be long and there may be externally stored fields */
+
+ if (UNIV_UNLIKELY(direction == 0)
+ && unique_search
+ && dict_index_is_clust(index)
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->used_in_HANDLER
+ && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
+
+ mode = PAGE_CUR_GE;
+
+ unique_search_from_clust_index = TRUE;
+
+ if (trx->mysql_n_tables_locked == 0
+ && prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view) {
+
+ /* This is a SELECT query done as a consistent read,
+ and the read view has already been allocated:
+ let us try a search shortcut through the hash
+ index.
+ NOTE that we must also test that
+ mysql_n_tables_locked == 0, because this might
+ also be INSERT INTO ... SELECT ... or
+ CREATE TABLE ... SELECT ... . Our algorithm is
+			NOT prepared for inserts interleaved with the SELECT,
+ and if we try that, we can deadlock on the adaptive
+ hash index semaphore! */
+
+#ifndef UNIV_SEARCH_DEBUG
+ if (!trx->has_search_latch) {
+ rw_lock_s_lock(&btr_search_latch);
+ trx->has_search_latch = TRUE;
+ }
+#endif
+ switch (row_sel_try_search_shortcut_for_mysql(
+ &rec, prebuilt, &offsets, &heap,
+ &mtr)) {
+ case SEL_FOUND:
+#ifdef UNIV_SEARCH_DEBUG
+ ut_a(0 == cmp_dtuple_rec(search_tuple,
+ rec, offsets));
+#endif
+ /* At this point, rec is protected by
+ a page latch that was acquired by
+ row_sel_try_search_shortcut_for_mysql().
+ The latch will not be released until
+ mtr_commit(&mtr). */
+
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ rec, offsets)) {
+ err = DB_TOO_BIG_RECORD;
+
+				/* We let the main loop do the
+ error handling */
+ goto shortcut_fails_too_big_rec;
+ }
+
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" shortcut\n", stderr); */
+
+ srv_n_rows_read++;
+
+ err = DB_SUCCESS;
+ goto release_search_latch_if_needed;
+
+ case SEL_EXHAUSTED:
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 2\n", stderr); */
+
+ err = DB_RECORD_NOT_FOUND;
+release_search_latch_if_needed:
+ if (trx->search_latch_timeout > 0
+ && trx->has_search_latch) {
+
+ trx->search_latch_timeout--;
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ /* NOTE that we do NOT store the cursor
+ position */
+ goto func_exit;
+
+ case SEL_RETRY:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+shortcut_fails_too_big_rec:
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
+ if (trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ trx_start_if_not_started(trx);
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE
+ && trx->mysql_thd != NULL
+ && thd_is_select(trx->mysql_thd)) {
+ /* It is a plain locking SELECT and the isolation
+ level is low: do not lock gaps */
+
+ set_also_gap_locks = FALSE;
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (UNIV_LIKELY(direction != 0)) {
+ ibool need_to_process = sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr);
+
+ if (UNIV_UNLIKELY(need_to_process)) {
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ /* We did a semi-consistent read,
+ but the record was removed in
+ the meantime. */
+ prebuilt->row_read_type
+ = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ } else if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+ /* The cursor was positioned on the record
+ that we returned previously. If we need
+ to repeat a semi-consistent read as a
+ pessimistic locking read, the record
+ cannot be skipped. */
+
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+ btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+
+ pcur->trx_if_known = trx;
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!moves_up
+ && !page_rec_is_supremum(rec)
+ && set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the next index record
+ to prevent phantoms in ORDER BY ... DESC queries */
+ const rec_t* next = page_rec_get_next_const(rec);
+
+ offsets = rec_get_offsets(next, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ next, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+ } else {
+ if (mode == PAGE_CUR_G) {
+ btr_pcur_open_at_index_side(
+ TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
+ &mtr);
+ } else if (mode == PAGE_CUR_L) {
+ btr_pcur_open_at_index_side(
+ FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
+ &mtr);
+ }
+ }
+
+ if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+
+ if (trx->read_view == NULL
+ && prebuilt->select_lock_type == LOCK_NONE) {
+
+ fputs("InnoDB: Error: MySQL is trying to"
+ " perform a consistent read\n"
+ "InnoDB: but the read view is not assigned!\n",
+ stderr);
+ trx_print(stderr, trx, 600);
+ fputc('\n', stderr);
+ ut_a(0);
+ }
+ } else if (prebuilt->select_lock_type == LOCK_NONE) {
+ /* This is a consistent read */
+ /* Assign a read view for the query */
+
+ trx_assign_read_view(trx);
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ ulint lock_mode;
+ if (prebuilt->select_lock_type == LOCK_S) {
+ lock_mode = LOCK_IS;
+ } else {
+ lock_mode = LOCK_IX;
+ }
+ err = lock_table(0, index->table, lock_mode, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ prebuilt->sql_stat_start = FALSE;
+ }
+
+rec_loop:
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(!!page_rec_is_comp(rec) == comp);
+#ifdef UNIV_SEARCH_DEBUG
+ /*
+ fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
+ page_get_page_no(page_align(rec)));
+ rec_print(rec);
+ */
+#endif /* UNIV_SEARCH_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using a READ COMMITTED isolation
+ level we do not lock gaps. Supremum record is really
+ a gap and therefore we do not set locks there. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+ /* A page supremum record cannot be in the result set: skip
+ it now that we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* Do sanity checks in case our cursor has bumped into page
+ corruption */
+
+ if (comp) {
+ next_offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ } else {
+ next_offs = rec_get_next_offs(rec, FALSE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ }
+
+ if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
+
+wrong_offs:
+ if (srv_force_recovery == 0 || moves_up == FALSE) {
+ ut_print_timestamp(stderr);
+ buf_page_print(page_align(rec), 0);
+ fprintf(stderr,
+ "\nInnoDB: rec address %p,"
+ " buf block fix count %lu\n",
+ (void*) rec, (ulong)
+ btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
+ ->page.buf_fix_count);
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". Run CHECK TABLE. You may need to\n"
+ "InnoDB: restore from a backup, or"
+ " dump + drop + reimport the table.\n",
+ stderr);
+
+ err = DB_CORRUPTION;
+
+ goto lock_wait_or_error;
+ } else {
+ /* The user may be dumping a corrupt table. Jump
+ over the corruption to recover as much as possible. */
+
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the rest of the page.\n",
+ stderr);
+
+ btr_pcur_move_to_last_on_page(pcur, &mtr);
+
+ goto next_rec;
+ }
+ }
+ /*-------------------------------------------------------------*/
+
+ /* Calculate the 'offsets' associated with 'rec' */
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+ if (!rec_validate(rec, offsets)
+ || !btr_index_rec_validate(rec, index, FALSE)) {
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the record.\n",
+ stderr);
+
+ goto next_rec;
+ }
+ }
+
+ /* Note that we cannot trust the up_match value in the cursor at this
+ place because we can arrive here after moving the cursor! Thus
+ we have to recompare rec and search_tuple to determine if they
+ match enough. */
+
+ if (match_mode == ROW_SEL_EXACT) {
+ /* Test if the index record matches completely to search_tuple
+ in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+ /* fputs("Comparing rec and search tuple\n", stderr); */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the index
+ record only if innodb_locks_unsafe_for_binlog
+ option is not set or this session is not
+ using a READ COMMITTED isolation level. */
+
+ err = sel_set_rec_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ err = DB_RECORD_NOT_FOUND;
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 3\n", stderr); */
+
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the index
+ record only if innodb_locks_unsafe_for_binlog
+ option is not set or this session is not
+ using a READ COMMITTED isolation level. */
+
+ err = sel_set_rec_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ err = DB_RECORD_NOT_FOUND;
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 4\n", stderr); */
+
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+		or this session is using a READ COMMITTED isolation
+ level we lock only the record, i.e., next-key locking is
+ not used. */
+
+ ulint lock_type;
+
+ if (!set_also_gap_locks
+ || srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED
+ || (unique_search
+ && !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
+
+ goto no_gap_lock;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ /* If we are doing a 'greater or equal than a primary key
+ value' search from a clustered index, and we find a record
+ that has that exact primary key value, then there is no need
+ to lock the gap before the record, because no insert in the
+ gap can be in our search range. That is, no phantom row can
+ appear that way.
+
+ An example: if col1 is the primary key, the search is WHERE
+ col1 >= 100, and we find a record where col1 = 100, then no
+ need to lock the gap before that record. */
+
+ if (index == clust_index
+ && mode == PAGE_CUR_GE
+ && direction == 0
+ && dtuple_get_n_fields_cmp(search_tuple)
+ == dict_index_get_n_unique(index)
+ && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+no_gap_lock:
+ lock_type = LOCK_REC_NOT_GAP;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ lock_type, thr);
+
+ switch (err) {
+ const rec_t* old_vers;
+ case DB_SUCCESS:
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+ /* Note that a record of
+ prebuilt->index was locked. */
+ prebuilt->new_rec_locks = 1;
+ }
+ break;
+ case DB_LOCK_WAIT:
+ if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_TRY_SEMI_CONSISTENT)
+ || index != clust_index) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_committed_vers_for_mysql(
+ clust_index, prebuilt, rec,
+ &offsets, &heap, &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ mutex_enter(&kernel_mutex);
+ if (trx->was_chosen_as_deadlock_victim) {
+ mutex_exit(&kernel_mutex);
+ err = DB_DEADLOCK;
+
+ goto lock_wait_or_error;
+ }
+ if (UNIV_LIKELY(trx->wait_lock != NULL)) {
+ lock_cancel_waiting_and_release(
+ trx->wait_lock);
+ prebuilt->new_rec_locks = 0;
+ } else {
+ mutex_exit(&kernel_mutex);
+
+ /* The lock was granted while we were
+ searching for the last committed version.
+ Do a normal locking read. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED,
+ &heap);
+ err = DB_SUCCESS;
+ /* Note that a record of
+ prebuilt->index was locked. */
+ prebuilt->new_rec_locks = 1;
+ break;
+ }
+ mutex_exit(&kernel_mutex);
+
+ if (old_vers == NULL) {
+ /* The row was not yet committed */
+
+ goto next_rec;
+ }
+
+ did_semi_consistent_read = TRUE;
+ rec = old_vers;
+ break;
+ default:
+
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
+
+ /* Fetch a previous version of the row if the current
+ one is not visible in the snapshot; if we have a very
+ high force recovery level set, we try to avoid crashes
+ by skipping this lookup */
+
+ if (UNIV_LIKELY(srv_force_recovery < 5)
+ && !lock_clust_rec_cons_read_sees(
+ rec, index, offsets, trx->read_view)) {
+
+ rec_t* old_vers;
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, rec, &offsets, &heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, trx->read_view)) {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ ut_ad(index != clust_index);
+
+ goto requires_clust_rec;
+ }
+ }
+
+ /* NOTE that at this point rec can be an old version of a clustered
+ index record built for a consistent read. We cannot assume after this
+ point that rec is on a buffer pool page. Functions like
+ page_rec_is_comp() cannot be used! */
+
+ if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
+
+ /* The record is delete-marked: we can skip it */
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE
+ && !did_semi_consistent_read) {
+
+ /* No need to keep a lock on a delete-marked record
+ if we do not want to use next-key locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+		/* This is an optimization to skip setting the next key lock
+		on the record that follows this delete-marked record. The
+		optimization works because the unique search criterion
+		precludes the presence of a range lock between this
+		delete-marked record and the record following it.
+
+		For now this is applicable only to clustered indexes while
+		doing a unique search. There is scope for further optimization
+		applicable to unique secondary indexes. The current behaviour
+		is to widen the scope of a lock on an already delete-marked
+		record if the same record is deleted twice by the same
+		transaction. */
+ if (index == clust_index && unique_search) {
+ err = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ goto next_rec;
+ }
+
+ /* Get the clustered index record if needed, if we did not do the
+ search using the clustered index. */
+
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+
+requires_clust_rec:
+ /* We use a 'goto' to the preceding label if a consistent
+ read of a secondary index record requires us to look up old
+ versions of the associated clustered index record. */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ /* The following call returns 'offsets' associated with
+ 'clust_rec'. Note that 'clust_rec' can be an old version
+ built for a consistent read. */
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec,
+ &offsets, &heap, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+ goto next_rec;
+ }
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+ /* Note that both the secondary index record
+ and the clustered index record were locked. */
+ ut_ad(prebuilt->new_rec_locks == 1);
+ prebuilt->new_rec_locks = 2;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
+
+ /* The record is delete marked: we can skip it */
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* No need to keep a lock on a delete-marked
+ record if we do not want to use next-key
+ locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+ goto next_rec;
+ }
+
+ if (prebuilt->need_to_access_clustered) {
+
+ result_rec = clust_rec;
+
+ ut_ad(rec_offs_validate(result_rec, clust_index,
+ offsets));
+ } else {
+ /* We used 'offsets' for the clust rec, recalculate
+ them for 'rec' */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ result_rec = rec;
+ }
+ } else {
+ result_rec = rec;
+ }
+
+ /* We found a qualifying record 'result_rec'. At this point,
+ 'offsets' are associated with 'result_rec'. */
+
+ ut_ad(rec_offs_validate(result_rec,
+ result_rec != rec ? clust_index : index,
+ offsets));
+
+ /* At this point, the clustered index record is protected
+ by a page latch that was acquired when pcur was positioned.
+ The latch will not be released until mtr_commit(&mtr). */
+
+ if ((match_mode == ROW_SEL_EXACT
+ || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->clust_index_was_generated
+ && !prebuilt->used_in_HANDLER
+ && prebuilt->template_type
+ != ROW_MYSQL_DUMMY_TEMPLATE) {
+
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+ update, that is why we require ...lock_type == LOCK_NONE.
+ Since we keep space in prebuilt only for the BLOBs of
+ a single row, we cannot cache rows in the case there
+ are BLOBs in the fields to be fetched. In HANDLER we do
+ not cache rows because there the cursor is a scrollable
+ cursor. */
+
+ row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
+ offsets);
+ if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
+
+ goto got_row;
+ }
+
+ goto next_rec;
+ } else {
+ if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
+ memcpy(buf + 4, result_rec
+ - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets));
+ mach_write_to_4(buf,
+ rec_offs_extra_size(offsets) + 4);
+ } else {
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ result_rec, offsets)) {
+ err = DB_TOO_BIG_RECORD;
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (prebuilt->clust_index_was_generated) {
+ if (result_rec != rec) {
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED,
+ &heap);
+ }
+ row_sel_store_row_id_to_prebuilt(prebuilt, rec,
+ index, offsets);
+ }
+ }
+
+ /* From this point on, 'offsets' are invalid. */
+
+got_row:
+ /* We have an optimization to save CPU time: if this is a consistent
+ read on a unique condition on the clustered index, then we do not
+ store the pcur position, because any fetch next or prev will anyway
+ return 'end of file'. Exceptions are locking reads and the MySQL
+ HANDLER command where the user can move the cursor with PREV or NEXT
+ even after a unique search. */
+
+ if (!unique_search_from_clust_index
+ || prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->used_in_HANDLER) {
+
+ /* Inside an update always store the cursor position */
+
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
+ err = DB_SUCCESS;
+
+ goto normal_return;
+
+next_rec:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = FALSE;
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
+ if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
+		/* We must commit mtr if we are moving to the next
+		non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr_start(&mtr);
+ if (sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr)) {
+#ifdef UNIV_SEARCH_DEBUG
+ cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
+not_moved:
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (match_mode != 0) {
+ err = DB_RECORD_NOT_FOUND;
+ } else {
+ err = DB_END_OF_INDEX;
+ }
+
+ goto normal_return;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
+ goto not_moved;
+ }
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+ cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+ goto rec_loop;
+
+lock_wait_or_error:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = FALSE;
+
+ /*-------------------------------------------------------------*/
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ que_thr_stop_for_mysql(thr);
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ /* It was a lock wait, and it ended */
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr_start(&mtr);
+
+ sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+ && !same_user_rec) {
+
+ /* Since we were not able to restore the cursor
+ on the same user record, we cannot use
+ row_unlock_for_mysql() to unlock any records, and
+ we must thus reset the new rec lock info. Since
+ in lock0lock.c we have blocked the inheriting of gap
+ X-locks, we actually do not have any new record locks
+ set in this case.
+
+ Note that if we were able to restore on the 'same'
+ user record, it is still possible that we were actually
+ waiting on a delete-marked record, and meanwhile
+ it was removed by purge and inserted again by some
+ other user. But that is no problem, because in
+ rec_loop we will again try to set a lock, and
+ new_rec_lock_info in trx will be right at the end. */
+
+ prebuilt->new_rec_locks = 0;
+ }
+
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+#ifdef UNIV_SEARCH_DEBUG
+ /* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+ goto func_exit;
+
+normal_return:
+ /*-------------------------------------------------------------*/
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ mtr_commit(&mtr);
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+ /* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+ if (err == DB_SUCCESS) {
+ srv_n_rows_read++;
+ }
+
+func_exit:
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Set or reset the "did semi-consistent read" flag on return.
+ The flag did_semi_consistent_read is set if and only if
+ the record being returned was fetched with a semi-consistent read. */
+ ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
+ || !did_semi_consistent_read);
+
+ if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
+ if (UNIV_UNLIKELY(did_semi_consistent_read)) {
+ prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ }
+ return(err);
+}
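+
+/* The row fetch cache used above batches rows only after the query has
+already fetched MYSQL_FETCH_CACHE_THRESHOLD rows, so short lookups pay no
+batching cost. The following self-contained sketch (compilable on its own)
+models that policy in plain C; the constants, fetch_cache_t and scan_next()
+are illustrative assumptions, not the InnoDB definitions. */
+
+#include <stdio.h>
+
+#define FETCH_CACHE_THRESHOLD	4 /* assumed stand-in for MYSQL_FETCH_CACHE_THRESHOLD */
+#define FETCH_CACHE_SIZE	8 /* assumed stand-in for MYSQL_FETCH_CACHE_SIZE */
+
+typedef struct {
+	int	rows[FETCH_CACHE_SIZE];
+	int	n_cached;	/* rows currently buffered */
+	int	first;		/* index of the next row to return */
+	long	n_fetched;	/* rows returned to the caller so far */
+} fetch_cache_t;
+
+/* Returns 1 and stores a row, or returns 0 at end of index. Rows are
+fetched one at a time until the threshold is passed, then in batches. */
+static int fetch_one(fetch_cache_t* c, int (*next_row)(int*), int* row)
+{
+	if (c->first == c->n_cached) {	/* cache empty: refill */
+		int	batch = (c->n_fetched >= FETCH_CACHE_THRESHOLD)
+			? FETCH_CACHE_SIZE : 1;
+
+		c->first = c->n_cached = 0;
+
+		while (c->n_cached < batch
+		       && next_row(&c->rows[c->n_cached])) {
+			c->n_cached++;
+		}
+
+		if (c->n_cached == 0) {
+			return(0);	/* end of index */
+		}
+	}
+
+	*row = c->rows[c->first++];
+	c->n_fetched++;
+	return(1);
+}
+
+static int scan_pos = 0;
+
+static int scan_next(int* row)	/* dummy cursor over ten rows */
+{
+	return(scan_pos < 10 ? (*row = scan_pos++, 1) : 0);
+}
+
+int main(void)
+{
+	fetch_cache_t	c = {{0}, 0, 0, 0};
+	int		row;
+
+	while (fetch_one(&c, scan_next, &row)) {
+		printf("row %d\n", row);
+	}
+	return(0);
+}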
+
+/*******************************************************************//**
+Checks whether MySQL is currently allowed to retrieve a consistent read
+result for this table, or to store one in the query cache.
+@return TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ trx_t* trx, /*!< in: transaction object */
+ const char* norm_name) /*!< in: concatenation of database name,
+ '/' char, table name */
+{
+ dict_table_t* table;
+ ibool ret = FALSE;
+
+ table = dict_table_get(norm_name, FALSE);
+
+ if (table == NULL) {
+
+ return(FALSE);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ /* Start the transaction if it is not started yet */
+
+ trx_start_if_not_started_low(trx);
+
+	/* If there are locks on the table, or some trx has invalidated the
+	cache up to our trx id, then ret = FALSE.
+	We do not check what type of locks there are on the table, though
+	only IX type locks actually would require ret = FALSE. */
+
+ if (UT_LIST_GET_LEN(table->locks) == 0
+ && ut_dulint_cmp(trx->id,
+ table->query_cache_inv_trx_id) >= 0) {
+
+ ret = TRUE;
+
+ /* If the isolation level is high, assign a read view for the
+ transaction if it does not yet have one */
+
+ if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+ && !trx->read_view) {
+
+ trx->read_view = read_view_open_now(
+ trx->id, trx->global_read_view_heap);
+ trx->global_read_view = trx->read_view;
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(ret);
+}
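+
+/* A minimal standalone model of the permission test above: 64-bit ids
+stand in for InnoDB's dulint pairs, and the struct fields are assumed
+stand-ins for UT_LIST_GET_LEN(table->locks) and
+table->query_cache_inv_trx_id. */
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef struct {
+	int		n_locks;		/* locks held on the table */
+	uint64_t	query_cache_inv_trx_id;	/* last invalidating trx id */
+} table_model_t;
+
+/* The cache may be used only if nobody holds a lock on the table and our
+transaction began no earlier than the last invalidation. */
+static int query_cache_permitted(uint64_t trx_id, const table_model_t* t)
+{
+	return(t->n_locks == 0 && trx_id >= t->query_cache_inv_trx_id);
+}
+
+int main(void)
+{
+	table_model_t	t = {0, 100};
+
+	printf("%d\n", query_cache_permitted(99, &t));	/* 0: invalidated after us */
+	printf("%d\n", query_cache_permitted(101, &t));	/* 1: permitted */
+	return(0);
+}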
+
+/*******************************************************************//**
+Read the AUTOINC column from the current row. If the value is less than
+0 and the type is not unsigned, then we reset the value to 0.
+@return value read from the column */
+static
+ib_uint64_t
+row_search_autoinc_read_column(
+/*===========================*/
+ dict_index_t* index, /*!< in: index to read from */
+ const rec_t* rec, /*!< in: current rec */
+ ulint col_no, /*!< in: column number */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ ulint len;
+ const byte* data;
+ ib_uint64_t value;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ data = rec_get_nth_field(rec, offsets, col_no, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len <= sizeof value);
+
+ /* we assume AUTOINC value cannot be negative */
+ value = mach_read_int_type(data, len, unsigned_type);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (!unsigned_type && (ib_int64_t) value < 0) {
+ value = 0;
+ }
+
+ return(value);
+}
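+
+/* A simplified stand-in for mach_read_int_type() used above, assuming
+InnoDB's on-disk integer convention: big-endian byte order with the sign
+bit inverted for signed types so that values sort as plain byte strings.
+That convention is an assumption stated here, not taken from this file. */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stddef.h>
+
+static uint64_t read_int_column(const unsigned char* data, size_t len,
+				int unsigned_type)
+{
+	uint64_t	value = 0;
+	size_t		i;
+
+	for (i = 0; i < len; i++) {
+		value = (value << 8) | data[i];	/* big-endian accumulate */
+	}
+
+	if (!unsigned_type) {
+		/* undo the sign-bit flip, then sign-extend to 64 bits */
+		value ^= (uint64_t) 1 << (8 * len - 1);
+
+		if (len < 8 && (value & ((uint64_t) 1 << (8 * len - 1)))) {
+			value |= ~(uint64_t) 0 << (8 * len);
+		}
+	}
+
+	return(value);
+}
+
+int main(void)
+{
+	/* -1 stored as a 4-byte signed column: 0x7FFFFFFF on disk */
+	unsigned char	neg[4] = {0x7F, 0xFF, 0xFF, 0xFF};
+	int64_t		v = (int64_t) read_int_column(neg, 4, 0);
+
+	printf("%lld\n", (long long) v);	/* prints -1 */
+
+	if (v < 0) {
+		v = 0;	/* mirror the negative-to-zero reset above */
+	}
+	return(0);
+}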
+
+/*******************************************************************//**
+Get the last row.
+@return current rec or NULL */
+static
+const rec_t*
+row_search_autoinc_get_rec(
+/*=======================*/
+ btr_pcur_t* pcur, /*!< in: the current cursor */
+ mtr_t* mtr) /*!< in: mini transaction */
+{
+ do {
+ const rec_t* rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ return(rec);
+ }
+ } while (btr_pcur_move_to_prev(pcur, mtr));
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return DB_SUCCESS if all OK, else an error code; DB_RECORD_NOT_FOUND if
+the column name cannot be found in the index */
+UNIV_INTERN
+ulint
+row_search_max_autoinc(
+/*===================*/
+ dict_index_t* index, /*!< in: index to search */
+ const char* col_name, /*!< in: name of autoinc column */
+ ib_uint64_t* value) /*!< out: AUTOINC value read */
+{
+ ulint i;
+ ulint n_cols;
+ dict_field_t* dfield = NULL;
+ ulint error = DB_SUCCESS;
+
+ n_cols = dict_index_get_n_ordering_defined_by_user(index);
+
+ /* Search the index for the AUTOINC column name */
+ for (i = 0; i < n_cols; ++i) {
+ dfield = dict_index_get_nth_field(index, i);
+
+ if (strcmp(col_name, dfield->name) == 0) {
+ break;
+ }
+ }
+
+ *value = 0;
+
+ /* Must find the AUTOINC column name */
+ if (i < n_cols && dfield) {
+ mtr_t mtr;
+ btr_pcur_t pcur;
+
+ mtr_start(&mtr);
+
+ /* Open at the high/right end (FALSE), and INIT
+ cursor (TRUE) */
+ btr_pcur_open_at_index_side(
+ FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
+ const rec_t* rec;
+
+ rec = row_search_autoinc_get_rec(&pcur, &mtr);
+
+ if (rec != NULL) {
+ ibool unsigned_type = (
+ dfield->col->prtype & DATA_UNSIGNED);
+
+ *value = row_search_autoinc_read_column(
+ index, rec, i, unsigned_type);
+ }
+ }
+
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+ } else {
+ error = DB_RECORD_NOT_FOUND;
+ }
+
+ return(error);
+}
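+
+/* A toy end-to-end model of the max-AUTOINC scan above: the "index" is an
+array sorted on the autoinc column, and the scan opens at the right end and
+walks left past non-user records, as row_search_autoinc_get_rec() does.
+All names and the row layout here are illustrative assumptions. */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+typedef struct {
+	uint64_t	value;		/* autoinc column value */
+	int		is_user_rec;	/* 0 models infimum/supremum */
+} toy_row_t;
+
+static int search_max_autoinc(const toy_row_t* rows, int n_rows,
+			      const char* index_col, const char* col_name,
+			      uint64_t* value)
+{
+	int	i;
+
+	*value = 0;
+
+	if (strcmp(index_col, col_name) != 0) {
+		return(-1);	/* DB_RECORD_NOT_FOUND analogue */
+	}
+
+	/* open at the right end; skip any non-user records */
+	for (i = n_rows - 1; i >= 0; i--) {
+		if (rows[i].is_user_rec) {
+			*value = rows[i].value;
+			break;
+		}
+	}
+
+	return(0);		/* DB_SUCCESS analogue */
+}
+
+int main(void)
+{
+	toy_row_t	rows[] = {{1, 1}, {7, 1}, {0, 0 /* supremum */}};
+	uint64_t	v;
+
+	if (search_max_autoinc(rows, 3, "id", "id", &v) == 0) {
+		printf("max autoinc = %llu\n", (unsigned long long) v);
+	}
+	return(0);
+}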
diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.c
new file mode 100644
index 00000000000..9f9c814f1a5
--- /dev/null
+++ b/storage/innobase/row/row0uins.c
@@ -0,0 +1,350 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0uins.c
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/***************************************************************//**
+Removes a clustered index record. The pcur in node was positioned on the
+record; now it is detached.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ undo_node_t* node) /*!< in: undo node */
+{
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+ &mtr);
+ ut_a(success);
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+ ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Drop the index tree associated with the row in
+ SYS_INDEXES table: */
+
+ dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &(node->pcur), &mtr);
+ ut_a(success);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (success) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+retry:
+	/* If that did not succeed, try a pessimistic descent down the tree */
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+ &(node->pcur), &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ trx_is_recv(node->trx)
+ ? RB_RECOVERY
+ : RB_NORMAL, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry to remove */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool found;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* No need to distinguish RB_RECOVERY here, because we
+ are deleting a secondary index record: the distinction
+ between RB_NORMAL and RB_RECOVERY only matters when
+ deleting a record that contains externally stored
+ columns. */
+ ut_ad(!dict_index_is_clust(index));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NORMAL, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+ ulint err;
+ ulint n_tries = 0;
+
+	/* First try an optimistic descent down the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+	/* Then try a pessimistic descent down the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
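+
+/* The optimistic-then-pessimistic removal above retries a bounded number
+of times when the tablespace is temporarily out of room. A self-contained
+sketch of that retry loop, assuming POSIX usleep() in place of
+os_thread_sleep(); the constants and _T-suffixed codes are local stand-ins
+for BTR_CUR_RETRY_DELETE_N_TIMES, BTR_CUR_RETRY_SLEEP_TIME and db0err. */
+
+#include <stdio.h>
+#include <unistd.h>
+
+#define RETRY_N_TIMES	100
+#define RETRY_SLEEP_US	50000
+
+enum { DB_SUCCESS_T = 0, DB_OUT_OF_FILE_SPACE_T = 1 };
+
+/* remove_entry() models row_undo_ins_remove_sec_low() with
+BTR_MODIFY_TREE: it may transiently fail while purge frees space. */
+static int remove_with_retry(int (*remove_entry)(void))
+{
+	int	n_tries = 0;
+	int	err;
+
+	for (;;) {
+		err = remove_entry();
+
+		if (err == DB_SUCCESS_T || n_tries >= RETRY_N_TIMES) {
+			return(err);
+		}
+
+		n_tries++;
+		usleep(RETRY_SLEEP_US);	/* wait for purge to free space */
+	}
+}
+
+static int fail_count = 2;
+
+static int flaky_remove(void)	/* fails twice, then succeeds */
+{
+	return(fail_count-- > 0 ? DB_OUT_OF_FILE_SPACE_T : DB_SUCCESS_T);
+}
+
+int main(void)
+{
+	printf("err = %d\n", remove_with_retry(flaky_remove));
+	return(0);
+}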
+
+/***********************************************************//**
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ dulint table_id;
+ ulint type;
+ ulint dummy;
+ ibool dummy_extern;
+
+ ut_ad(node);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
+ ut_ad(type == TRX_UNDO_INSERT_REC);
+ node->rec_type = type;
+
+ node->update = NULL;
+ node->table = dict_table_get_on_id(table_id, node->trx);
+
+ /* Skip the UNDO if we can't find the table or the .ibd file. */
+ if (UNIV_UNLIKELY(node->table == NULL)) {
+ } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
+ node->table = NULL;
+ } else {
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index != NULL) {
+ ptr = trx_undo_rec_get_row_ref(
+ ptr, clust_index, &node->ref, node->heap);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: table ");
+ ut_print_name(stderr, node->trx, TRUE,
+ node->table->name);
+ fprintf(stderr, " has no indexes, "
+ "ignoring the table\n");
+
+ node->table = NULL;
+ }
+ }
+}
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+ulint
+row_undo_ins(
+/*=========*/
+ undo_node_t* node) /*!< in: row undo node */
+{
+ ut_ad(node);
+ ut_ad(node->state == UNDO_NODE_INSERT);
+
+ row_undo_ins_parse_undo_rec(node);
+
+ if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+
+	/* Iterate over all the indexes and undo the insert. */
+
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ while (node->index != NULL) {
+ dtuple_t* entry;
+ ulint err;
+
+ entry = row_build_index_entry(node->row, node->ext,
+ node->index, node->heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(trx_is_recv(node->trx));
+ } else {
+ err = row_undo_ins_remove_sec(node->index, entry);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ return(row_undo_ins_remove_clust_rec(node));
+}
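+
+/* Order matters above: the secondary index entries are removed first and
+the clustered index record last, the reverse of insertion order. A toy
+model of that loop; toy_index_t and its list are assumptions, not the
+dict0dict structures. */
+
+#include <stdio.h>
+
+typedef struct toy_index_struct {
+	const char*			name;
+	struct toy_index_struct*	next;
+} toy_index_t;
+
+static void undo_insert(toy_index_t* clust_index)
+{
+	toy_index_t*	index;
+
+	/* skip the clustered index (the first in the list) */
+	for (index = clust_index->next; index != NULL; index = index->next) {
+		printf("remove entry from secondary index %s\n", index->name);
+	}
+
+	printf("remove clustered index record from %s\n", clust_index->name);
+}
+
+int main(void)
+{
+	toy_index_t	sec2 = {"k2", NULL};
+	toy_index_t	sec1 = {"k1", &sec2};
+	toy_index_t	clust = {"PRIMARY", &sec1};
+
+	undo_insert(&clust);
+	return(0);
+}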
diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c
new file mode 100644
index 00000000000..6be475d8c78
--- /dev/null
+++ b/storage/innobase/row/row0umod.c
@@ -0,0 +1,815 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0umod.c
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may already have the delete mark set to FALSE, if the delete mark
+operation was interrupted underway, or if the undo operation ended
+prematurely because of a system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/***********************************************************//**
+Checks if also the previous version of the clustered index record was
+modified or inserted by the same transaction, and its undo number is such
+that it should be undone in the same rollback.
+@return TRUE if also previous modify or insert of this row should be undone */
+UNIV_INLINE
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+ undo_node_t* node, /*!< in: row undo node */
+ undo_no_t* undo_no)/*!< out: the undo number */
+{
+ trx_undo_rec_t* undo_rec;
+ trx_t* trx;
+
+ trx = node->trx;
+
+ if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) {
+
+ *undo_no = ut_dulint_zero;
+ return(FALSE);
+ }
+
+ undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap);
+
+ *undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0);
+}
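+
+/* A minimal model of the test above, with 64-bit ids in place of dulints:
+the previous version must be undone in this same rollback iff it was
+written by the same transaction and its undo number is still at or above
+the rollback limit. */
+
+#include <stdio.h>
+#include <stdint.h>
+
+static int undo_also_prev_vers(uint64_t new_trx_id, uint64_t trx_id,
+			       uint64_t prev_undo_no, uint64_t roll_limit)
+{
+	if (new_trx_id != trx_id) {
+		return(0);	/* written by another transaction */
+	}
+
+	return(roll_limit <= prev_undo_no);
+}
+
+int main(void)
+{
+	printf("%d\n", undo_also_prev_vers(7, 7, 42, 10));	/* 1 */
+	printf("%d\n", undo_also_prev_vers(7, 7, 5, 10));	/* 0 */
+	printf("%d\n", undo_also_prev_vers(6, 7, 42, 10));	/* 0 */
+	return(0);
+}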
+
+/***********************************************************//**
+Undoes a modify in a clustered index record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr; must be committed before
+ latching any further pages */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+
+ if (mode == BTR_MODIFY_LEAF) {
+
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ mem_heap_t* heap = NULL;
+ big_rec_t* dummy_big_rec;
+
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, &heap, &dummy_big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+
+ ut_a(!dummy_big_rec);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Removes a clustered index record after undo if possible.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_remove_clust_low(
+/*==========================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr __attribute__((unused)), /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ if (!success) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* Find out if we can remove the whole clustered index record */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+ /* Ok, we can remove */
+ } else {
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* Note that since this operation is analogous to purge,
+		we can also free inherited externally stored fields:
+		hence the RB_NONE in the call below */
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, RB_NONE, mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record. Sets also the node state for the
+next round of undo.
+@return DB_SUCCESS or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ ulint err;
+ ibool success;
+ ibool more_vers;
+ undo_no_t new_undo_no;
+
+ ut_ad(node && thr);
+
+ /* Check if also the previous version of the clustered index record
+ should be undone in this same rollback operation */
+
+ more_vers = row_undo_mod_undo_also_prev_vers(node, &new_undo_no);
+
+ pcur = &(node->pcur);
+
+ mtr_start(&mtr);
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_LEAF);
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a
+ pessimistic descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ if (more_vers && err == DB_SUCCESS) {
+
+ /* Reserve the undo log record to the prior version after
+ committing &mtr: this is necessary to comply with the latching
+ order, as &mtr may contain the fsp latch which is lower in
+ the latch hierarchy than trx->undo_mutex. */
+
+ success = trx_undo_rec_reserve(node->trx, new_undo_no);
+
+ if (success) {
+ node->state = UNDO_NODE_PREV_VERS;
+ }
+ }
+
+ return(err);
+}
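+
+/* The control flow above is the common InnoDB two-phase pattern: first an
+optimistic attempt that stays on one leaf page, then, only on failure, a
+pessimistic attempt that may reorganize the tree. A standalone sketch of
+that fallback; the mode constants, _T-suffixed codes and op() callback are
+assumptions. */
+
+#include <stdio.h>
+
+enum { DB_SUCCESS_T = 0, DB_FAIL_T = 1 };
+enum { MODIFY_LEAF = 0, MODIFY_TREE = 1 };
+
+static int update_with_fallback(int (*op)(int mode))
+{
+	int	err = op(MODIFY_LEAF);	/* optimistic: one page only */
+
+	if (err != DB_SUCCESS_T) {
+		err = op(MODIFY_TREE);	/* pessimistic: may split/merge */
+	}
+
+	return(err);
+}
+
+static int op_stub(int mode)	/* leaf attempt fails, tree succeeds */
+{
+	return(mode == MODIFY_LEAF ? DB_FAIL_T : DB_SUCCESS_T);
+}
+
+int main(void)
+{
+	printf("err = %d\n", update_with_fallback(op_stub));
+	return(0);
+}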
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry */
+ ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ ibool found;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+		/* In crash recovery, the secondary index record may
+		be missing if the UPDATE did not have time to insert
+		all the secondary index records before the crash; when
+		we undo that UPDATE in crash recovery, such records
+		will not be found.
+
+ In normal processing, if an update ends in a deadlock
+ before it has inserted all updated secondary index
+ records, then the undo will not find those records. */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+	/* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some version
+	requires it, we should delete mark the record. */
+
+ mtr_start(&mtr_vers);
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_a(success);
+
+ old_has = row_vers_old_has_index_entry(FALSE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ if (old_has) {
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* No need to distinguish RB_RECOVERY here, because we
+ are deleting a secondary index record: the distinction
+ between RB_NORMAL and RB_RECOVERY only matters when
+ deleting a record that contains externally stored
+ columns. */
+ ut_ad(!dict_index_is_clust(index));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NORMAL, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
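+
+/* The decision above reduces to a walk over the still-unpurged prior
+versions of the row: if any of them needs this secondary index entry, the
+entry is only delete-marked, otherwise it is removed. A toy model of that
+walk; toy_version_t is an assumption standing in for the version chain
+inspected by row_vers_old_has_index_entry(). */
+
+#include <stdio.h>
+
+typedef struct {
+	int	has_entry;	/* does this version contain the entry? */
+} toy_version_t;
+
+enum { SEC_DEL_MARK, SEC_REMOVE };
+
+static int undo_sec_action(const toy_version_t* vers, int n_vers)
+{
+	int	i;
+
+	for (i = 0; i < n_vers; i++) {
+		if (vers[i].has_entry) {
+			return(SEC_DEL_MARK);
+		}
+	}
+
+	return(SEC_REMOVE);
+}
+
+int main(void)
+{
+	toy_version_t	vers[] = {{0}, {1}};
+
+	printf("%s\n", undo_sec_action(vers, 2) == SEC_DEL_MARK
+	       ? "delete-mark" : "remove");
+	return(0);
+}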
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.c, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ ulint err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE);
+ return(err);
+}
+
+/***********************************************************//**
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does no harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
+@return DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+ ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ upd_t* update;
+ ulint err = DB_SUCCESS;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+
+ /* Ignore indexes that are being created. */
+ if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) {
+
+ return(DB_SUCCESS);
+ }
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ if (UNIV_UNLIKELY(!row_search_index_entry(index, entry,
+ mode, &pcur, &mtr))) {
+ fputs("InnoDB: error in sec index entry del undo in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 0);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ } else {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, FALSE, thr, &mtr);
+ ut_a(err == DB_SUCCESS);
+ heap = mem_heap_create(100);
+
+ update = row_upd_build_sec_rec_difference_binary(
+ index, entry, btr_cur_get_rec(btr_cur), trx, heap);
+ if (upd_get_n_fields(update) == 0) {
+
+ /* Do nothing */
+
+ } else if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping
+ changes within the page */
+
+ err = btr_cur_optimistic_update(
+ BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
+ btr_cur, update, 0, thr, &mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(
+ BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
+ btr_cur, &heap, &dummy_big_rec,
+ update, 0, thr, &mtr);
+ ut_a(!dummy_big_rec);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
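+
+/* The switch in the optimistic branch above funnels the page-level
+outcomes of btr_cur_optimistic_update() into a single DB_FAIL so that the
+caller simply retries with BTR_MODIFY_TREE. A standalone sketch of that
+normalization; the _T-suffixed enum values are local stand-ins for the
+db0err codes. */
+
+#include <stdio.h>
+
+enum {
+	DB_SUCCESS_T, DB_FAIL_T,
+	DB_OVERFLOW_T, DB_UNDERFLOW_T, DB_ZIP_OVERFLOW_T
+};
+
+static int normalize_optimistic_err(int err)
+{
+	switch (err) {
+	case DB_OVERFLOW_T:	/* record no longer fits on the page */
+	case DB_UNDERFLOW_T:	/* page would become too empty */
+	case DB_ZIP_OVERFLOW_T:	/* compressed page would overflow */
+		return(DB_FAIL_T);	/* caller retries pessimistically */
+	default:
+		return(err);
+	}
+}
+
+int main(void)
+{
+	printf("%d\n", normalize_optimistic_err(DB_ZIP_OVERFLOW_T));	/* 1 */
+	printf("%d\n", normalize_optimistic_err(DB_SUCCESS_T));		/* 0 */
+	return(0);
+}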
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_DEL.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err = DB_SUCCESS;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(trx_is_recv(thr_get_trx(thr)));
+ } else {
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is DEL_MARK.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ ut_a(entry);
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ /* No change in secondary indexes */
+
+ return(DB_SUCCESS);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field_binary(node->row, node->index,
+ node->update)) {
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ ut_a(entry);
+ /* NOTE that if we updated the fields of a
+ delete-marked secondary index record so that
+ alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc', we cannot return to the original
+ values because we do not know them. But this should
+ not cause problems because in row0sel.c, in queries
+ we always retrieve the clustered index record or an
+ earlier version of it, if the secondary index record
+ through which we do the search is delete-marked. */
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr,
+ index,
+ entry);
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row. We also need to update the fields of
+ the secondary index record if we updated its fields
+ but alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc'. */
+ mem_heap_empty(heap);
+ entry = row_build_index_entry(node->undo_row,
+ node->undo_ext,
+ index, heap);
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ dulint table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+ ibool dummy_extern;
+ trx_t* trx;
+
+ ut_ad(node && thr);
+ trx = thr_get_trx(thr);
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ node->table = dict_table_get_on_id(table_id, trx);
+
+ /* TODO: other fixes associated with DROP TABLE + rollback in the
+ same table by another user */
+
+ if (node->table == NULL) {
+ /* Table was dropped */
+ return;
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+ node->new_roll_ptr = roll_ptr;
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+}
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+
+ row_undo_mod_parse_undo_rec(node, thr);
+
+ if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+ /* It is already undone, or will be undone by another query
+ thread, or table was dropped */
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ err = row_undo_mod_upd_exist_sec(node, thr);
+
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+ err = row_undo_mod_del_mark_sec(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ err = row_undo_mod_upd_del_sec(node, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_clust(node, thr);
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0undo.c b/storage/innobase/row/row0undo.c
new file mode 100644
index 00000000000..3d739c9689a
--- /dev/null
+++ b/storage/innobase/row/row0undo.c
@@ -0,0 +1,377 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.c
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was
+not part of the ordering fields in the clustered index? Maybe we would have
+to write it to the undo log. Well, maybe not, because if we order the row id
+and trx id in descending order, then the only undeleted copy is the first in
+the index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+the ordering fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither a secondary index field nor a clustered index
+ordering field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. Presumably this is not a problem.
+(2) If it updates a secondary index ordering field but not a clustered
+ordering field: then the secondary index contains delete-marked records
+which differ in an ordering field. No problem.
+(3) If it updates a clustered ordering field but not the secondary index,
+and the secondary index is unique: then the record in the secondary index
+is just updated at the clustered ordering field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
+
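+/* Illustrative sketch, not part of the original file: the "linked list of
+row versions" described above, modelled as a self-contained toy in plain C.
+All names below are hypothetical stand-ins; in InnoDB the chain is formed
+by the clustered index record and undo log records linked by roll ptrs. */
+#if 0
+struct toy_version {
+	unsigned long		trx_id;		/* id of the creating trx */
+	int			del_marked;	/* delete mark flag */
+	struct toy_version*	older;		/* stands in for the roll ptr;
+						NULL means this version was
+						a fresh insert */
+};
+
+/* Walks the version list to its end, one roll ptr at a time, the way
+rollback processes one undo record at a time. */
+static const struct toy_version*
+toy_walk_to_oldest(const struct toy_version* v)
+{
+	while (v->older != NULL) {
+		v = v->older;
+	}
+
+	return(v);
+}
+
+/* Models the rollback removal criterion quoted above: a record can go
+away if it has no older version (it was a fresh insert) or its older
+version belongs to a trx already (being) purged. */
+static int
+toy_removable_in_rollback(const struct toy_version* v,
+			  int older_already_purged)
+{
+	return(v->older == NULL || older_already_purged);
+}
+#endif
+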
+/********************************************************************//**
+Creates a row undo node for a query graph.
+@return own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx && parent && heap);
+
+ undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ btr_pcur_init(&(undo->pcur));
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in: row undo node */
+{
+ dict_index_t* clust_index;
+ ibool found;
+ mtr_t mtr;
+ ibool ret;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ rec = btr_pcur_get_rec(&(node->pcur));
+
+ offsets = rec_get_offsets(rec, clust_index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(rec, clust_index,
+ offsets))) {
+
+ /* We must remove the reservation on the undo log record
+ BEFORE releasing the latch on the clustered index page: this
+ is to make sure that some thread will eventually undo the
+ modification corresponding to node->roll_ptr. */
+
+ /* fputs("--------------------undoing a previous version\n",
+ stderr); */
+
+ ret = FALSE;
+ } else {
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ offsets, NULL, &node->ext, node->heap);
+ if (node->update) {
+ node->undo_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->undo_row, &node->undo_ext,
+ clust_index, node->update, node->heap);
+ } else {
+ node->undo_row = NULL;
+ node->undo_ext = NULL;
+ }
+
+ btr_pcur_store_position(&(node->pcur), &mtr);
+
+ ret = TRUE;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_undo(
+/*=====*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+ trx_t* trx;
+ roll_ptr_t roll_ptr;
+ ibool locked_data_dict;
+
+ ut_ad(node && thr);
+
+ trx = node->trx;
+
+ if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+ trx->roll_limit,
+ &roll_ptr,
+ node->heap);
+ if (!node->undo_rec) {
+ /* Rollback completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+
+ } else if (node->state == UNDO_NODE_PREV_VERS) {
+
+ /* Undo should be done to the same clustered index record
+ again in this same rollback, restoring the previous version */
+
+ roll_ptr = node->new_roll_ptr;
+
+ node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+ node->heap);
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+ }
+
+ /* Prevent DROP TABLE etc. while we are rolling back this row.
+ If we are doing a TABLE CREATE or some other dictionary operation,
+ then we already have dict_operation_lock locked in x-mode. Do not
+ try to lock again, because that would cause a hang. */
+
+ locked_data_dict = (trx->dict_operation_lock_mode == 0);
+
+ if (locked_data_dict) {
+
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ if (node->state == UNDO_NODE_INSERT) {
+
+ err = row_undo_ins(node);
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ } else {
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+ err = row_undo_mod(node, thr);
+ }
+
+ if (locked_data_dict) {
+
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+ undo_node_t* node;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ srv_activity_count++;
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ err = row_undo(node, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+
+ fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
+ (ulong) err);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ fprintf(stderr,
+ "InnoDB: Error 13 means out of tablespace.\n"
+ "InnoDB: Consider increasing"
+ " your tablespace.\n");
+
+ exit(1);
+ }
+
+ ut_error;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c
new file mode 100644
index 00000000000..58dfd43ead9
--- /dev/null
+++ b/storage/innobase/row/row0upd.c
@@ -0,0 +1,2177 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.c
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for the previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged will the index records be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
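+/* Illustrative sketch, not part of the original file: the logical delete
+described above, applied to a toy record. All names are hypothetical; in
+this file the real work is done through btr_cur_del_mark_set_clust_rec()
+and the undo log, and no index tree structure is changed. */
+#if 0
+struct toy_rec {
+	int		del_bit;	/* delete mark in the record */
+	unsigned long	trx_id;		/* id of the last modifying trx */
+	unsigned long	roll_ptr;	/* points into the undo log */
+};
+
+/* Delete-marks rec on behalf of a transaction: the old system columns
+are first saved (here, copied to undo_copy) so the change can be rolled
+back; only then are the delete bit, trx id, and roll ptr replaced. */
+static void
+toy_delete_mark(struct toy_rec* rec, unsigned long trx_id,
+		unsigned long new_roll_ptr, struct toy_rec* undo_copy)
+{
+	*undo_copy = *rec;
+
+	rec->del_bit = 1;
+	rec->trx_id = trx_id;
+	rec->roll_ptr = new_roll_ptr;
+}
+#endif
+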
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: old value of index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n); /*!< in: how many first fields to check */
+
+
+/*********************************************************************//**
+Checks if index currently is mentioned as a referenced index in a foreign
+key constraint.
+
+NOTE that since we do not hold dict_operation_lock when leaving the
+function, it may be that the referencing table has been dropped when
+we leave this function: this function is only for heuristic use!
+
+@return TRUE if referenced */
+static
+ibool
+row_upd_index_is_referenced(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table = index->table;
+ dict_foreign_t* foreign;
+ ibool froze_data_dict = FALSE;
+ ibool is_referenced = FALSE;
+
+ if (!UT_LIST_GET_FIRST(table->referenced_list)) {
+
+ return(FALSE);
+ }
+
+ if (trx->dict_operation_lock_mode == 0) {
+ row_mysql_freeze_data_dictionary(trx);
+ froze_data_dict = TRUE;
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ if (foreign->referenced_index == index) {
+
+ is_referenced = TRUE;
+ goto func_exit;
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+func_exit:
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(is_referenced);
+}
+
+/*********************************************************************//**
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur.
+
+NOTE that this function will temporarily commit mtr and lose the
+pcur position!
+
+@return DB_SUCCESS or an error code */
+static
+ulint
+row_upd_check_references_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ const rec_t* rec;
+ ulint n_ext;
+ ulint err;
+ ibool got_s_lock = FALSE;
+
+ if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+ &n_ext, heap);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->referenced_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+
+ if (foreign->foreign_table == NULL) {
+ dict_table_get(foreign->foreign_table_name,
+ FALSE);
+ }
+
+ if (foreign->foreign_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ (foreign->foreign_table
+ ->n_foreign_key_checks_running)++;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects 'foreign' from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ FALSE, foreign, table, entry, thr);
+
+ if (foreign->foreign_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ ut_a(foreign->foreign_table
+ ->n_foreign_key_checks_running > 0);
+
+ (foreign->foreign_table
+ ->n_foreign_key_checks_running)--;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ err = DB_SUCCESS;
+
+func_exit:
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(upd_node_t));
+ node->common.type = QUE_NODE_UPDATE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->in_mysql_interface = FALSE;
+
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ node->index = NULL;
+ node->update = NULL;
+
+ node->foreign = NULL;
+ node->cascade_heap = NULL;
+ node->cascade_node = NULL;
+
+ node->select = NULL;
+
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ node->cmpl_info = 0;
+
+ return(node);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint pos, /*!< in: TRX_ID position in rec */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ page_zip, rec, offsets, pos, trx_id, roll_ptr);
+ } else {
+ byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ trx_write_trx_id(field, trx_id);
+ trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ const dtuple_t* entry, /*!< in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val) /*!< in: value to write */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint pos;
+
+ ut_ad(dict_index_is_clust(index));
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ dfield = dtuple_get_nth_field(entry, pos);
+ field = dfield_get_data(dfield);
+
+ if (type == DATA_TRX_ID) {
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+ new_len = dfield_get_len(new_val);
+
+ if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
+ /* A bug fixed on Dec 31st, 2004: we looked at the
+ SQL NULL size from the wrong field! We may backport
+ this fix also to 4.0. The merge to 5.0 will be made
+ manually immediately after we commit this to 4.1. */
+
+ new_len = dict_col_get_sql_null_size(
+ dict_index_get_nth_col(index,
+ upd_field->field_no),
+ 0);
+ }
+
+ old_len = rec_offs_nth_size(offsets, upd_field->field_no);
+
+ if (rec_offs_comp(offsets)
+ && rec_offs_nth_sql_null(offsets,
+ upd_field->field_no)) {
+ /* Note that in the compact table format, for a
+ variable length field, an SQL NULL will use zero
+ bytes in the offset array at the start of the physical
+ record, but a zero-length value (empty string) will
+ use one byte! Thus, we cannot use update-in-place
+ if we update an SQL NULL varchar to an empty string! */
+
+ old_len = UNIV_SQL_NULL;
+ }
+
+ if (dfield_is_ext(new_val) || old_len != new_len
+ || rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the record
+given. No field size changes are allowed. */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /*!< in/out: record where replaced */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ page_zip_des_t* page_zip)/*!< in: compressed page with enough space
+ available, or NULL */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (rec_offs_comp(offsets)) {
+ rec_set_info_bits_new(rec, update->info_bits);
+ } else {
+ rec_set_info_bits_old(rec, update->info_bits);
+ }
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+ ut_ad(!dfield_is_ext(new_val) ==
+ !rec_offs_nth_extern(offsets, upd_field->field_no));
+
+ rec_set_nth_field(rec, offsets, upd_field->field_no,
+ dfield_get_data(new_val),
+ dfield_get_len(new_val));
+ }
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_rec(page_zip, rec, index, offsets, 0);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ dict_index_t* index, /*!< in: clustered index */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */
+ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mtr */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr);
+
+ log_ptr += mach_write_compressed(log_ptr,
+ dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID));
+
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data of system field values.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint* pos, /*!< out: TRX_ID position in record */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr)/*!< out: roll ptr */
+{
+ ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+ return(NULL);
+ }
+
+ *roll_ptr = trx_read_roll_ptr(ptr);
+ ptr += DATA_ROLL_PTR_LEN;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id);
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+ const upd_t* update, /*!< in: update vector */
+ byte* log_ptr,/*!< in: pointer to mlog buffer: must
+ contain at least MLOG_BUF_MARGIN bytes
+ of free space; the buffer is closed
+ within this function */
+ mtr_t* mtr) /*!< in: mtr into whose log to write */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf_end;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+ mach_write_to_1(log_ptr, update->info_bits);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+
+#if MLOG_BUF_MARGIN <= 30
+# error "MLOG_BUF_MARGIN <= 30"
+#endif
+
+ if (log_ptr + 30 > buf_end) {
+ mlog_close(mtr, log_ptr);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+
+ len = dfield_get_len(new_val);
+
+ log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+ log_ptr += mach_write_compressed(log_ptr, len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (log_ptr + len < buf_end) {
+ memcpy(log_ptr, dfield_get_data(new_val), len);
+
+ log_ptr += len;
+ } else {
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr,
+ dfield_get_data(new_val),
+ len);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+ }
+ }
+
+ mlog_close(mtr, log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ mem_heap_t* heap, /*!< in: memory heap where update vector is
+ built */
+ upd_t** update_out)/*!< out: update vector */
+{
+ upd_t* update;
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ ulint info_bits;
+ ulint i;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_bits = mach_read_from_1(ptr);
+ ptr++;
+ ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ update = upd_create(n_fields, heap);
+ update->info_bits = info_bits;
+
+ for (i = 0; i < n_fields; i++) {
+ ulint field_no;
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &field_no);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ upd_field->field_no = field_no;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ dfield_set_data(new_val,
+ mem_heap_dup(heap, ptr, len), len);
+ ptr += len;
+ } else {
+ dfield_set_null(new_val);
+ }
+ }
+
+ *update_out = update;
+
+ return(ptr);
+}
+
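+/* Illustrative sketch, not part of the original file: the log record
+layout written by row_upd_index_write_log() and read back by
+row_upd_index_parse() above. A whole record is one info-bits byte, a
+field count, then the per-field part shown here on a toy buffer.
+Fixed-width copies stand in for the mach_*_compressed variable-length
+encoding; all names are hypothetical. */
+#if 0
+#include <string.h>
+
+/* Appends one update field in the order used above: field number, then
+length, then the data bytes (omitted when the value is SQL NULL). */
+static unsigned char*
+toy_log_field(unsigned char* p, unsigned field_no, unsigned len,
+	      const void* data)
+{
+	memcpy(p, &field_no, sizeof field_no);
+	p += sizeof field_no;
+
+	memcpy(p, &len, sizeof len);
+	p += sizeof len;
+
+	if (data != NULL) {
+		memcpy(p, data, len);
+		p += len;
+	}
+
+	return(p);
+}
+#endif
+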
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: secondary index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint i;
+ ulint offsets_[REC_OFFS_SMALL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a secondary index */
+ ut_a(!dict_index_is_clust(index));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE that it may be that len != dfield_get_len(dfield) if we
+ are updating in a character set and collation where strings of
+ different length can be equal in an alphabetical comparison,
+ and also in the case where we have a column prefix index
+ and the last characters in the index field are spaces; the
+ latter case probably caused the assertion failures reported at
+ row0upd.c line 713 in versions 4.0.14 - 4.0.16. */
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: clustered index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint roll_ptr_pos;
+ ulint trx_id_pos;
+ ulint i;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a clustered index */
+ ut_a(dict_index_is_clust(index));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+ trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (i == trx_id_pos || i == roll_ptr_pos) {
+
+ goto skip_compare;
+ }
+
+ if (UNIV_UNLIKELY(!dfield_is_ext(dfield)
+ != !rec_offs_nth_extern(offsets, i))
+ || !dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ n_diff++;
+ }
+skip_compare:
+ ;
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***********************************************************//**
+Fetch a prefix of an externally stored column. This is similar
+to row_ext_lookup(), but the row_ext_t holds the old values
+of the column and must not be poisoned with the new values.
+@return BLOB prefix */
+static
+byte*
+row_upd_ext_fetch(
+/*==============*/
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part */
+ ulint local_len, /*!< in: length of data, in bytes */
+ ulint zip_size, /*!< in: nonzero=compressed BLOB
+ page size, zero for uncompressed
+ BLOBs */
+ ulint* len, /*!< in: length of prefix to fetch;
+ out: fetched length of the prefix */
+ mem_heap_t* heap) /*!< in: heap where to allocate */
+{
+ byte* buf = mem_heap_alloc(heap, *len);
+
+ *len = btr_copy_externally_stored_field_prefix(buf, *len,
+ zip_size,
+ data, local_len);
+ /* We should never update records containing a half-deleted BLOB. */
+ ut_a(*len);
+
+ return(buf);
+}
+
+/***********************************************************//**
+Replaces the new column value stored in the update vector in
+the given index entry field. */
+static
+void
+row_upd_index_replace_new_col_val(
+/*==============================*/
+ dfield_t* dfield, /*!< in/out: data field
+ of the index entry */
+ const dict_field_t* field, /*!< in: index field */
+ const dict_col_t* col, /*!< in: field->col */
+ const upd_field_t* uf, /*!< in: update field */
+ mem_heap_t* heap, /*!< in: memory heap for allocating
+ and copying the new value */
+ ulint zip_size)/*!< in: compressed page
+ size of the table, or 0 */
+{
+ ulint len;
+ const byte* data;
+
+ dfield_copy_data(dfield, &uf->new_val);
+
+ if (dfield_is_null(dfield)) {
+ return;
+ }
+
+ len = dfield_get_len(dfield);
+ data = dfield_get_data(dfield);
+
+ if (field->prefix_len > 0) {
+ ibool fetch_ext = dfield_is_ext(dfield)
+ && len < (ulint) field->prefix_len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (fetch_ext) {
+ ulint l = len;
+
+ len = field->prefix_len;
+
+ data = row_upd_ext_fetch(data, l, zip_size,
+ &len, heap);
+ }
+
+ len = dtype_get_at_most_n_mbchars(col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ field->prefix_len, len,
+ (const char*) data);
+
+ dfield_set_data(dfield, data, len);
+
+ if (!fetch_ext) {
+ dfield_dup(dfield, heap);
+ }
+
+ return;
+ }
+
+ switch (uf->orig_len) {
+ byte* buf;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(dfield,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(dfield);
+ /* fall through */
+ case 0:
+ dfield_dup(dfield, heap);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+ buf = mem_heap_alloc(heap, uf->orig_len);
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(dfield, buf, uf->orig_len);
+ dfield_set_ext(dfield);
+ break;
+ }
+}
+
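+/* Illustrative sketch, not part of the original file: the buffer layout
+rebuilt in the default case of the switch in
+row_upd_index_replace_new_col_val() above. The constant mirrors
+BTR_EXTERN_FIELD_REF_SIZE; everything else is a hypothetical stand-in. */
+#if 0
+#include <string.h>
+
+#define TOY_REF_SIZE	20	/* size of the external field reference */
+
+/* Reconstructs the original locally stored part of an externally stored
+column: orig_len - TOY_REF_SIZE bytes of local prefix from the start of
+data, then the field reference taken from the very end of the (longer)
+prefix that was written to the undo log. */
+static void
+toy_rebuild_local_part(unsigned char* buf, const unsigned char* data,
+		       unsigned long len, unsigned long orig_len)
+{
+	memcpy(buf, data, orig_len - TOY_REF_SIZE);
+	memcpy(buf + orig_len - TOY_REF_SIZE,
+	       data + len - TOY_REF_SIZE, TOY_REF_SIZE);
+}
+#endif
+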
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ ibool order_only,
+ /*!< in: if TRUE, limit the replacement to
+ ordering fields of index; note that this
+ does not work for non-clustered indexes. */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+{
+ ulint i;
+ ulint n_fields;
+ const ulint zip_size = dict_table_zip_size(index->table);
+
+ ut_ad(index);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ if (order_only) {
+ n_fields = dict_index_get_n_unique(index);
+ } else {
+ n_fields = dict_index_get_n_fields(index);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ uf = upd_get_field_by_field_no(update, i);
+
+ if (uf) {
+ row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+{
+ ulint i;
+ const dict_index_t* clust_index
+ = dict_table_get_first_index(index->table);
+ const ulint zip_size
+ = dict_table_zip_size(index->table);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ uf = upd_get_field_by_field_no(
+ update, dict_col_get_clust_pos(col, clust_index));
+
+ if (uf) {
+ row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+ ulint n_ext_cols;
+ ulint* ext_cols;
+ const dict_table_t* table;
+
+ ut_ad(row);
+ ut_ad(ext);
+ ut_ad(index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(update);
+ ut_ad(heap);
+
+ n_cols = dtuple_get_n_fields(row);
+ table = index->table;
+ ut_ad(n_cols == dict_table_get_n_cols(table));
+
+ ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols);
+ n_ext_cols = 0;
+
+ dtuple_set_info_bits(row, update->info_bits);
+
+ for (col_no = 0; col_no < n_cols; col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+ const ulint clust_pos
+ = dict_col_get_clust_pos(col, index);
+ dfield_t* dfield;
+
+ if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) {
+
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no != clust_pos) {
+
+ continue;
+ }
+
+ dfield_copy_data(dfield, &upd_field->new_val);
+ break;
+ }
+
+ if (dfield_is_ext(dfield) && col->ord_part) {
+ ext_cols[n_ext_cols++] = col_no;
+ }
+ }
+
+ if (n_ext_cols) {
+ *ext = row_ext_create(n_ext_cols, ext_cols, row,
+ dict_table_zip_size(table), heap);
+ } else {
+ *ext = NULL;
+ }
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update) /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+{
+ ulint n_unique;
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+
+ n_unique = dict_index_get_n_unique(index);
+ n_upd_fields = upd_get_n_fields(update);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n_unique; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+ ulint col_no;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+ col_no = dict_col_get_no(col);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ /* Note that if the index field is a column prefix
+ then it may be that row does not contain an externally
+ stored part of the column value, and we cannot compare
+			the data */
+
+ if (col_pos == upd_field->field_no
+ && (row == NULL
+ || ind_field->prefix_len > 0
+ || !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(row, col_no),
+ &(upd_field->new_val)))) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update) /*!< in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_index_t* index;
+ ulint i;
+
+ index = dict_table_get_first_index(table);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (dict_field_get_col(dict_index_get_nth_field(
+ index, upd_field->field_no))
+ ->ord_part) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n) /*!< in: how many first fields to check */
+{
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+ ut_ad(n <= dict_index_get_n_fields(index));
+
+ n_upd_fields = upd_get_n_fields(update);
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+
+ ut_a(ind_field->prefix_len == 0);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(entry, i),
+ &(upd_field->new_val))) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /*!< in: record in a clustered index */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_field(rec, offsets,
+ column->field_nos[SYM_CLUST_FIELD_NO],
+ &len);
+		if (len == UNIV_SQL_NULL) {
+			/* SQL NULL: there is no data to copy */
+			data = NULL;
+		}
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /*!< in/out: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/***********************************************************//**
+Stores to the heap the row on which the node->pcur is positioned. */
+static
+void
+row_upd_store_row(
+/*==============*/
+ upd_node_t* node) /*!< in: row update node */
+{
+ dict_index_t* clust_index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ rec = btr_pcur_get_rec(node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ NULL, &node->ext, node->heap);
+ if (node->is_delete) {
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ } else {
+ node->upd_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->upd_row, &node->upd_ext,
+ clust_index, node->update, node->heap);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ibool check_ref;
+ ibool found;
+ dict_index_t* index;
+ dtuple_t* entry;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ mem_heap_t* heap;
+ rec_t* rec;
+ ulint err = DB_SUCCESS;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+
+ index = node->index;
+
+ check_ref = row_upd_index_is_referenced(index, trx);
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, node->ext, index, heap);
+ ut_a(entry);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ if (UNIV_UNLIKELY(!found)) {
+ fputs("InnoDB: error in sec index entry update in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+
+ trx_print(stderr, trx, 0);
+
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ } else {
+ /* Delete mark the old index record; it can already be
+ delete marked if we return after a lock wait in
+ row_ins_index_entry below */
+
+ if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(index->table))) {
+ err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
+ thr, &mtr);
+ if (err == DB_SUCCESS && check_ref) {
+
+ ulint* offsets = rec_get_offsets(
+ rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete || err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Build a new index entry */
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
+ /* Insert new index entry */
+ err = row_ins_index_entry(index, entry, 0, TRUE, thr);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the secondary index record if it is changed in the row update, or
+delete marks it if this is a delete.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+UNIV_INLINE
+ulint
+row_upd_sec_step(
+/*=============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!dict_index_is_clust(node->index));
+
+ if (node->state == UPD_NODE_UPDATE_ALL_SEC
+ || row_upd_changes_ord_field_binary(node->row, node->index,
+ node->update)) {
+ return(row_upd_sec_index_entry(node, thr));
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ que_thr_t* thr, /*!< in: query thread */
+ ibool check_ref,/*!< in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ mem_heap_t* heap = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ dtuple_t* entry;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+ rec_t* rec;
+ dict_index_t* index;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ return(err);
+ }
+
+ /* Mark as not-owned the externally stored fields which the new
+ row inherits from the delete marked record: purge should not
+ free those externally stored fields even if the delete marked
+ record is removed from the index tree, or updated. */
+
+ rec = btr_cur_get_rec(btr_cur);
+ index = dict_table_get_first_index(table);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ btr_cur_mark_extern_inherited_fields(
+ btr_cur_get_page_zip(btr_cur),
+ rec, index, offsets, node->update, mtr);
+ if (check_ref) {
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+ }
+ }
+
+ mtr_commit(mtr);
+
+ if (!heap) {
+ heap = mem_heap_create(500);
+ }
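+	/* Remember that the delete mark and the extern field bookkeeping
+	above have been done: if row_ins_index_entry() below returns
+	DB_LOCK_WAIT and this function is entered again, we skip directly
+	to the insert. */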
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
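+	/* Stamp the entry with the id of the updating transaction in the
+	DATA_TRX_ID system field. */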
+ row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+ if (node->upd_ext) {
+ /* If we return from a lock wait, for example, we may have
+ extern fields marked as not-owned in entry (marked in the
+ if-branch above). We must unmark them. */
+
+ btr_cur_unmark_dtuple_extern_fields(entry);
+
+ /* We must mark non-updated extern fields in entry as
+ inherited, so that a possible rollback will not free them. */
+
+ btr_cur_mark_dtuple_inherited_extern(entry, node->update);
+ }
+
+ err = row_ins_index_entry(index, entry,
+ node->upd_ext ? node->upd_ext->n_ext : 0,
+ TRUE, thr);
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates a clustered index record of a row when the ordering fields do
+not change.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ mem_heap_t* heap = NULL;
+ big_rec_t* big_rec = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks because we assume the x-lock on the
+ record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+
+ return(DB_SUCCESS);
+ }
+
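+	/* If the buffer pool is running out of space, do not attempt the
+	pessimistic update below, which might have to allocate new pages;
+	return an error instead. */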
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr_start(mtr);
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+ &heap, &big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+ mtr_commit(mtr);
+
+ if (err == DB_SUCCESS && big_rec) {
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_t* rec;
+ rec_offs_init(offsets_);
+
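+		/* The update produced fields too long to store inline in
+		the record: restore the cursor position and write them to
+		externally stored BLOB pages. */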
+ mtr_start(mtr);
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+ rec = btr_cur_get_rec(btr_cur);
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(btr_cur), rec,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ big_rec, mtr);
+ mtr_commit(mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks a clustered index record.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in/out: rec_get_offsets() for the
+ record under the cursor */
+ que_thr_t* thr, /*!< in: query thread */
+ ibool check_ref,/*!< in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(node->is_delete);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	/* Store the row because we also have to build the secondary
+	index entries */
+
+ row_upd_store_row(node);
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err == DB_SUCCESS && check_ref) {
+ /* NOTE that the following call loses the position of pcur ! */
+
+ err = row_upd_check_references_constraints(node,
+ pcur, index->table,
+ index, offsets,
+ thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the clustered index record.
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
+in case of a lock wait, else error code */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ ibool check_ref;
+ ulint err;
+ mtr_t* mtr;
+ mtr_t mtr_buf;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+ mtr = &mtr_buf;
+
+ mtr_start(mtr);
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+
+ /* If this is a row in SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (node->is_delete
+ && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
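+	/* If the records to be updated were not locked when they were
+	fetched, acquire an exclusive lock on the record now. */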
+ if (!node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(
+ 0, btr_pcur_get_block(pcur),
+ rec, index, offsets, thr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ goto exit_func;
+ }
+ }
+
+ /* NOTE: the following function calls will also commit mtr */
+
+ if (node->is_delete) {
+ err = row_upd_del_mark_clust_rec(node, index, offsets,
+ thr, check_ref, mtr);
+ if (err == DB_SUCCESS) {
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ node->index = dict_table_get_next_index(index);
+ }
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (UNIV_UNLIKELY(!node->in_mysql_interface)) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+ row_upd_copy_columns(rec, offsets,
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ err = row_upd_clust_rec(node, index, thr, mtr);
+ return(err);
+ }
+
+ row_upd_store_row(node);
+
+ if (row_upd_changes_ord_field_binary(node->row, index, node->update)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+		TODO! What to do about the 'Halloween problem', where an
+		update moves the record forward in the index so that it is
+		updated again when the cursor arrives there? Solution: the
+		read operation must check the undo number of the undo record
+		when choosing records to update. Currently MySQL solves the
+		problem externally! */
+
+ err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
+ mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ } else {
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_SOME_SEC;
+ }
+
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the affected index records of a row. When control is transferred
+to this node, we assume that we have a persistent cursor which was
+positioned on a record, and that the position has been stored in the cursor.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd(
+/*====*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err = DB_SUCCESS;
+
+ ut_ad(node && thr);
+
+ if (UNIV_LIKELY(node->in_mysql_interface)) {
+
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (node->is_delete
+ || row_upd_changes_some_index_ord_field_binary(
+ node->table, node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ if (node->state == UPD_NODE_UPDATE_CLUSTERED
+ || node->state == UPD_NODE_INSERT_CLUSTERED) {
+
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ }
+
+ if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+
+ goto function_exit;
+ }
+
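+	/* Go through the remaining secondary indexes one at a time;
+	row_upd_sec_step() skips an index unless this is a delete or an
+	ordering field of the index changes. */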
+ while (node->index != NULL) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+function_exit:
+ if (err == DB_SUCCESS) {
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ mem_heap_empty(node->heap);
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ ulint err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started(trx);
+
+ node = thr->run_node;
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(thr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c
new file mode 100644
index 00000000000..a4fbb5289aa
--- /dev/null
+++ b/storage/innobase/row/row0vers.c
@@ -0,0 +1,741 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0vers.c
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "lock0lock.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function!
+@return NULL if committed, else the active transaction */
+UNIV_INTERN
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ ulint* clust_offsets;
+ rec_t* version;
+ trx_id_t trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry = NULL; /* assignment to eliminate compiler
+ warning */
+ trx_t* trx;
+ ulint rec_del;
+ ulint err;
+ mtr_t mtr;
+ ulint comp;
+
+ ut_ad(mutex_own(&kernel_mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+	/* Search for the clustered index record: this is a time-consuming
+	operation, and therefore we release the kernel mutex; the release
+	is also required by the latching order convention. The latch on the
+	clustered index locks the top of the stack of versions. We also
+	reserve purge_latch to lock the bottom of the version stack. */
+
+ clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+ &clust_index, &mtr);
+ if (!clust_rec) {
+		/* In a rare case it is possible that no clust rec is found
+		for a secondary index record: this happens if, in row0umod.c
+		row_undo_mod_remove_clust_low(), we have already removed the
+		clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ mutex_enter(&kernel_mutex);
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+
+ mtr_s_lock(&(purge_sys->latch), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ trx = NULL;
+ if (!trx_is_active(trx_id)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active: no implicit lock on rec */
+ goto exit_func;
+ }
+
+ if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
+ clust_offsets, TRUE)) {
+ /* Corruption noticed: try to avoid a crash by returning */
+ goto exit_func;
+ }
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(index->table == clust_index->table);
+ ut_ad(!!comp == dict_table_is_comp(index->table));
+ ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+ /* We look up if some earlier version, which was modified by the trx_id
+ transaction, of the clustered index record would require rec to be in
+ a different state (delete marked or unmarked, or have different field
+ values, or not existing). If there is such a version, then rec was
+ modified by the trx_id transaction, and it has an implicit x-lock on
+ rec. Note that if clust_rec itself would require rec to be in a
+ different state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+ rec_del = rec_get_deleted_flag(rec, comp);
+ trx = NULL;
+
+ version = clust_rec;
+
+ for (;;) {
+ rec_t* prev_version;
+ ulint vers_del;
+ row_ext_t* ext;
+ trx_id_t prev_trx_id;
+
+ mutex_exit(&kernel_mutex);
+
+		/* While we retrieve an earlier version of clust_rec, we
+		release the kernel mutex, because it may take time to access
+		the disk. After the release, we have to check if the trx_id
+		transaction is still active. We keep in mtr the latch on the
+		clust_rec page, so that no other transaction can update
+		clust_rec and thereby get an implicit x-lock on rec. */
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (prev_version == NULL) {
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no
+ implicit x-lock */
+
+ break;
+ }
+
+ /* If the transaction is still active,
+ clust_rec must be a fresh insert, because no
+ previous version was found. */
+ ut_ad(err == DB_SUCCESS);
+
+ /* It was a freshly inserted version: there is an
+ implicit x-lock on rec */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ /* If the trx_id and prev_trx_id are different and if
+ the prev_version is marked deleted then the
+ prev_trx_id must have already committed for the trx_id
+ to be able to modify the row. Therefore, prev_trx_id
+ cannot hold any implicit lock. */
+ if (vers_del && 0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+
+ mutex_enter(&kernel_mutex);
+ break;
+ }
+
+ /* The stack of versions is locked by mtr. Thus, it
+ is safe to fetch the prefixes for externally stored
+ columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+ clust_offsets, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+ /* entry may be NULL if a record was inserted in place
+ of a deleted record, and the BLOB pointers of the new
+ record were not initialized yet. But in that case,
+ prev_version should be NULL. */
+ ut_a(entry);
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no implicit x-lock */
+
+ break;
+ }
+
+ /* If we get here, we know that the trx_id transaction is
+ still active and it has modified prev_version. Let us check
+ if prev_version would require rec to be in a different
+ state. */
+
+ /* The previous version of clust_rec must be
+ accessible, because the transaction is still active
+ and clust_rec was not a fresh insert. */
+ ut_ad(err == DB_SUCCESS);
+
+		/* We check whether entry and rec compare as equal in the
+		alphabetical ordering */
+ if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+			/* It is possible that the row was updated so that
+			the secondary index record remained the same in the
+			alphabetical ordering, but the field values still
+			changed. For example, 'abc' -> 'ABC'. Check that
+			case as well. */
+
+ dtuple_set_types_binary(entry,
+ dtuple_get_n_fields(entry));
+ if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+			/* The chain of versions modified by the trx_id
+			transaction ends at prev_version: no implicit x-lock */
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+exit_func:
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(trx);
+}
+
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr) /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ if (trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* A purge operation is not yet allowed to remove this
+ delete marked record */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= the purge view, and where the secondary index entry and ientry
+compare as equal in the alphabetical ordering; exactly in this case we
+return TRUE.
+@return TRUE if an earlier version should have the entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry) /*!< in: the secondary index entry */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ ulint* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ const dtuple_t* row;
+ const dtuple_t* entry;
+ ulint err;
+ ulint comp;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(!dict_table_is_comp(index->table) == !comp);
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record to clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records to secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+		/* NOTE that we cannot do the comparison as binary
+		fields because the row may be in the middle of being
+		modified, so that the clustered index record has already
+		been updated to a different binary value in a char field,
+		but the collation still identifies the old and new
+		values as equal! */
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = rec;
+
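+	/* Walk backwards through the versions of the clustered index
+	record, reconstructing each one from the undo log, and look for
+	a non-delete-marked version matching ientry. */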
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (err != DB_SUCCESS || !prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers. This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+			/* NOTE that we cannot do the comparison as binary
+			fields because the secondary index record may have
+			already been updated to a different binary value in
+			a char field, but the collation still identifies the
+			old and new values as equal! */
+
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/*!< out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!read_view_sees_trx_id(view, trx_id));
+
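+	/* The s-latch on purge_sys prevents the purge view from changing,
+	so that purge cannot free the undo log records of the versions
+	that we read below. */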
+ rw_lock_s_lock(&(purge_sys->latch));
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* heap2 = heap;
+ trx_undo_rec_t* undo_rec;
+ roll_ptr_t roll_ptr;
+ undo_no_t undo_no;
+ heap = mem_heap_create(1024);
+
+		/* If we have a high-granularity consistent read view and
+		the creator transaction of the view is the same as trx_id
+		in the record, we see this record only in the case when the
+		undo_no of the record is < the undo_no in the view. */
+
+ if (view->type == VIEW_HIGH_GRANULARITY
+ && ut_dulint_cmp(view->creator_trx_id, trx_id) == 0) {
+
+ roll_ptr = row_get_rec_roll_ptr(version, index,
+ *offsets);
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ mem_heap_empty(heap);
+
+ if (ut_dulint_cmp(view->undo_no, undo_no) > 0) {
+ /* The view already sees this version: we can
+ copy it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap,
+ rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index,
+ *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+ }
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ *offsets = rec_get_offsets(prev_version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (read_view_sees_trx_id(view, trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers)/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+ trx_id_t rec_trx_id = ut_dulint_zero;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ /* The S-latch on purge_sys prevents the purge view from
+ changing. Thus, if we have an uncommitted transaction at
+ this point, then purge cannot remove its undo log even if
+ the transaction could commit now. */
+
+ version = rec;
+
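+	/* Walk back through the version chain until we find a version
+	written by a committed transaction: that version is what the
+	semi-consistent read returns. */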
+ for (;;) {
+ trx_t* version_trx;
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ mutex_enter(&kernel_mutex);
+ version_trx = trx_get_on_id(version_trx_id);
+ mutex_exit(&kernel_mutex);
+
+ if (!version_trx
+ || version_trx->conc_state == TRX_NOT_STARTED
+ || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+
+ /* We found a version that belongs to a
+ committed transaction: return it. */
+
+ if (rec == version) {
+ *old_vers = rec;
+ err = DB_SUCCESS;
+ break;
+ }
+
+ /* We assume that a rolled-back transaction stays in
+ TRX_ACTIVE state until all the changes have been
+ rolled back and the transaction is removed from
+ the global list of transactions. */
+
+ if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) {
+ /* The transaction was committed while
+ we searched for earlier versions.
+ Return the current version as a
+ semi-consistent read. */
+
+ version = rec;
+ *offsets = rec_get_offsets(version,
+ index, *offsets,
+ ULINT_UNDEFINED,
+ offset_heap);
+ }
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ *offsets = rec_get_offsets(version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+ }/* for (;;) */
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}