summaryrefslogtreecommitdiff
path: root/storage/innobase/row
diff options
context:
space:
mode:
authorunknown <brian@zim.(none)>2005-04-26 19:07:13 -0700
committerunknown <brian@zim.(none)>2005-04-26 19:07:13 -0700
commita9da10f7a8e626aac0bd3a8d82b20c7b864c7061 (patch)
tree7028a9aade64c6143da00a4301627ce5d262c0f0 /storage/innobase/row
parenta2ed27af5291f778c1fc6e23cf4edc1bc36f0bed (diff)
parent25311ea4a5f83652959a0744d99a4eb51aa9d328 (diff)
downloadmariadb-git-a9da10f7a8e626aac0bd3a8d82b20c7b864c7061.tar.gz
Merge baker@bk-internal.mysql.com:/home/bk/mysql-5.1
into zim.(none):/home/brian/mysql/mysql-5.1 configure.in: Auto merged mysql-test/mysql-test-run.sh: Auto merged sql/Makefile.am: Auto merged sql/ha_innodb.cc: Auto merged sql/ha_myisam.cc: Auto merged sql/lock.cc: Auto merged storage/innobase/btr/btr0pcur.c: Auto merged storage/innobase/dict/dict0dict.c: Auto merged storage/innobase/dict/dict0load.c: Auto merged storage/innobase/fil/fil0fil.c: Auto merged storage/innobase/include/btr0pcur.h: Auto merged storage/innobase/include/btr0pcur.ic: Auto merged storage/innobase/include/dict0dict.h: Auto merged storage/innobase/include/dict0load.h: Auto merged storage/innobase/include/os0file.h: Auto merged storage/innobase/include/srv0srv.h: Auto merged storage/innobase/log/log0log.c: Auto merged storage/innobase/os/os0file.c: Auto merged storage/innobase/row/row0ins.c: Auto merged storage/innobase/row/row0mysql.c: Auto merged storage/innobase/row/row0sel.c: Auto merged storage/innobase/srv/srv0srv.c: Auto merged storage/innobase/srv/srv0start.c: Auto merged storage/myisam/mi_check.c: Auto merged storage/myisam/mi_create.c: Auto merged storage/myisam/mi_dynrec.c: Auto merged storage/myisam/mi_search.c: Auto merged storage/myisam/mi_write.c: Auto merged storage/myisam/myisamdef.h: Auto merged storage/myisam/myisampack.c: Auto merged storage/myisammrg/myrg_create.c: Auto merged storage/ndb/include/kernel/signaldata/BackupImpl.hpp: Auto merged storage/ndb/include/kernel/signaldata/BackupSignalData.hpp: Auto merged storage/ndb/include/kernel/signaldata/TuxMaint.hpp: Auto merged storage/ndb/include/mgmapi/mgmapi_config_parameters.h: Auto merged storage/ndb/include/ndbapi/NdbScanOperation.hpp: Auto merged storage/ndb/include/ndbapi/NdbTransaction.hpp: Auto merged storage/ndb/src/common/debugger/signaldata/BackupImpl.cpp: Auto merged storage/ndb/src/kernel/blocks/ERROR_codes.txt: Auto merged storage/ndb/src/kernel/blocks/backup/Backup.cpp: Auto merged storage/ndb/src/kernel/blocks/backup/Backup.hpp: Auto merged storage/ndb/src/kernel/blocks/backup/Backup.txt: Auto merged storage/ndb/src/kernel/blocks/backup/BackupInit.cpp: Auto merged storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp: Auto merged storage/ndb/src/kernel/blocks/dbacc/Dbacc.hpp: Auto merged storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp: Auto merged storage/ndb/src/kernel/main.cpp: Auto merged storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp: Auto merged storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/DbtupRoutines.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/DbtupTrigger.cpp: Auto merged storage/ndb/src/kernel/blocks/dbtup/Notes.txt: Auto merged storage/ndb/src/kernel/blocks/dbtux/DbtuxNode.cpp: Auto merged storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp: Auto merged storage/ndb/src/kernel/blocks/ndbfs/OpenFiles.hpp: Auto merged storage/ndb/src/mgmapi/mgmapi.cpp: Auto merged storage/ndb/src/mgmsrv/ConfigInfo.cpp: Auto merged storage/ndb/src/mgmsrv/MgmtSrvr.cpp: Auto merged storage/ndb/src/mgmsrv/MgmtSrvr.hpp: Auto merged storage/ndb/src/mgmsrv/MgmtSrvrGeneralSignalHandling.cpp: Auto merged storage/ndb/src/ndbapi/Ndb.cpp: Auto merged storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp: Auto merged storage/ndb/src/ndbapi/NdbIndexOperation.cpp: Auto merged storage/ndb/src/ndbapi/NdbOperationDefine.cpp: Auto merged storage/ndb/src/ndbapi/NdbOperationSearch.cpp: Auto merged storage/ndb/src/ndbapi/NdbScanOperation.cpp: Auto merged storage/ndb/src/ndbapi/NdbTransaction.cpp: Auto merged storage/ndb/src/ndbapi/Ndbif.cpp: Auto merged storage/ndb/src/ndbapi/Ndblist.cpp: Auto merged storage/ndb/src/ndbapi/TransporterFacade.cpp: Auto merged storage/ndb/src/ndbapi/ndberror.c: Auto merged storage/ndb/test/ndbapi/Makefile.am: Auto merged storage/ndb/test/ndbapi/testBackup.cpp: Auto merged storage/ndb/test/ndbapi/testIndex.cpp: Auto merged storage/ndb/test/ndbapi/testNodeRestart.cpp: Auto merged storage/ndb/test/ndbapi/testOIBasic.cpp: Auto merged storage/ndb/test/ndbapi/testOperations.cpp: Auto merged storage/ndb/test/run-test/daily-basic-tests.txt: Auto merged storage/ndb/test/run-test/daily-devel-tests.txt: Auto merged storage/ndb/test/src/NdbBackup.cpp: Auto merged storage/ndb/tools/desc.cpp: Auto merged
Diffstat (limited to 'storage/innobase/row')
-rw-r--r--storage/innobase/row/Makefile.am25
-rw-r--r--storage/innobase/row/makefilewin34
-rw-r--r--storage/innobase/row/row0ins.c2424
-rw-r--r--storage/innobase/row/row0mysql.c4085
-rw-r--r--storage/innobase/row/row0purge.c671
-rw-r--r--storage/innobase/row/row0row.c730
-rw-r--r--storage/innobase/row/row0sel.c4066
-rw-r--r--storage/innobase/row/row0uins.c308
-rw-r--r--storage/innobase/row/row0umod.c773
-rw-r--r--storage/innobase/row/row0undo.c350
-rw-r--r--storage/innobase/row/row0upd.c2035
-rw-r--r--storage/innobase/row/row0vers.c492
12 files changed, 15993 insertions, 0 deletions
diff --git a/storage/innobase/row/Makefile.am b/storage/innobase/row/Makefile.am
new file mode 100644
index 00000000000..bd09f9a237d
--- /dev/null
+++ b/storage/innobase/row/Makefile.am
@@ -0,0 +1,25 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+# & Innobase Oy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+include ../include/Makefile.i
+
+noinst_LIBRARIES = librow.a
+
+librow_a_SOURCES = row0ins.c row0mysql.c row0purge.c row0row.c row0sel.c\
+ row0uins.c row0umod.c row0undo.c row0upd.c row0vers.c
+
+EXTRA_PROGRAMS =
diff --git a/storage/innobase/row/makefilewin b/storage/innobase/row/makefilewin
new file mode 100644
index 00000000000..c17240c6119
--- /dev/null
+++ b/storage/innobase/row/makefilewin
@@ -0,0 +1,34 @@
+include ..\include\makefile.i
+
+row.lib: row0mysql.obj row0upd.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+ lib -out:..\libs\row.lib row0mysql.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+
+row0mysql.obj: row0mysql.c
+ $(CCOM) $(CFL) -c row0mysql.c
+
+row0ins.obj: row0ins.c
+ $(CCOM) $(CFL) -c row0ins.c
+
+row0sel.obj: row0sel.c
+ $(CCOM) $(CFL) -c row0sel.c
+
+row0upd.obj: row0upd.c
+ $(CCOM) $(CFL) -c row0upd.c
+
+row0undo.obj: row0undo.c
+ $(CCOM) $(CFL) -c row0undo.c
+
+row0purge.obj: row0purge.c
+ $(CCOM) $(CFL) -c row0purge.c
+
+row0row.obj: row0row.c
+ $(CCOM) $(CFL) -c row0row.c
+
+row0vers.obj: row0vers.c
+ $(CCOM) $(CFL) -c row0vers.c
+
+row0umod.obj: row0umod.c
+ $(CCOM) $(CFL) -c row0umod.c
+
+row0uins.obj: row0uins.c
+ $(CCOM) $(CFL) -c row0uins.c
diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c
new file mode 100644
index 00000000000..303fe5749bc
--- /dev/null
+++ b/storage/innobase/row/row0ins.c
@@ -0,0 +1,2424 @@
+/******************************************************
+Insert into a table
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+
+#define ROW_INS_PREV 1
+#define ROW_INS_NEXT 2
+
+
+/*********************************************************************
+This prototype is copied from /mysql/sql/ha_innodb.cc.
+Invalidates the MySQL query cache for the table.
+NOTE that the exact prototype of this function has to be in
+/innobase/row/row0ins.c! */
+extern
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /* in: transaction which modifies the table */
+ char* full_name, /* in: concatenation of database name, null
+ char '\0', table name, null char'\0';
+ NOTE that in Windows this is always
+ in LOWER CASE! */
+ ulint full_name_len); /* in: full name length where also the null
+ chars count */
+
+/**********************************************************************
+This function returns true if
+
+1) SQL-query in the current thread
+is either REPLACE or LOAD DATA INFILE REPLACE.
+
+2) SQL-query in the current thread
+is INSERT ON DUPLICATE KEY UPDATE.
+
+NOTE that /mysql/innobase/row/row0ins.c must contain the
+prototype for this function ! */
+
+ibool
+innobase_query_is_update(void);
+
+/*************************************************************************
+Creates an insert node struct. */
+
+ins_node_t*
+ins_node_create(
+/*============*/
+ /* out, own: insert node struct */
+ ulint ins_type, /* in: INS_VALUES, ... */
+ dict_table_t* table, /* in: table where to insert */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ ins_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+ node->common.type = QUE_NODE_INSERT;
+
+ node->ins_type = ins_type;
+
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->table = table;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->select = NULL;
+
+ node->trx_id = ut_dulint_zero;
+
+ node->entry_sys_heap = mem_heap_create(128);
+
+ node->magic_n = INS_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***************************************************************
+Creates an entry template for each index of a table. */
+static
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ dict_index_t* index;
+ dtuple_t* entry;
+
+ ut_ad(node->entry_sys_heap);
+
+ UT_LIST_INIT(node->entry_list);
+
+ index = dict_table_get_first_index(node->table);
+
+ while (index != NULL) {
+ entry = row_build_index_entry(node->row, index,
+ node->entry_sys_heap);
+ UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+
+ index = dict_table_get_next_index(index);
+ }
+}
+
+/*********************************************************************
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /* in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dict_col_t* col;
+ dfield_t* dfield;
+ ulint len;
+ byte* ptr;
+
+ row = node->row;
+ table = node->table;
+ heap = node->entry_sys_heap;
+
+ ut_ad(row && table && heap);
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+ /* 1. Allocate buffer for row id */
+
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+ node->row_id_buf = ptr;
+
+ if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
+
+ /* 2. Fill in the dfield for mix id */
+
+ col = dict_table_get_sys_col(table, DATA_MIX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ len = mach_dulint_get_compressed_size(table->mix_id);
+ ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN);
+
+ mach_dulint_write_compressed(ptr, table->mix_id);
+ dfield_set_data(dfield, ptr, len);
+ }
+
+ /* 3. Allocate buffer for trx id */
+
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+ node->trx_id_buf = ptr;
+
+ /* 4. Allocate buffer for roll ptr */
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
+/*************************************************************************
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /* in: insert node */
+ dtuple_t* row) /* in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate from entry_sys_heap buffers for sys fields */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = ut_dulint_zero;
+}
+
+/***********************************************************************
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads. */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ big_rec_t* dummy_big_rec;
+ mem_heap_t* heap;
+ upd_t* update;
+ rec_t* rec;
+ ulint err;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad((cursor->index->type & DICT_CLUSTERED) == 0);
+ ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp));
+
+ /* We know that in the alphabetical ordering, entry and rec are
+ identified. But in their binary form there may be differences if
+ there are char fields in them. Therefore we have to calculate the
+ difference. */
+
+ heap = mem_heap_create(1024);
+
+ update = row_upd_build_sec_rec_difference_binary(cursor->index,
+ entry, rec, thr_get_trx(thr), heap);
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+ update, 0, thr, mtr);
+ if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+ &dummy_big_rec, update, 0, thr, mtr);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************************
+Does an insert operation by delete unmarking and updating a delete marked
+existing record in the index. This situation can occur if the delete marked
+record is kept in the index for consistent reads. */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ big_rec_t** big_rec,/* out: possible big rec vector of fields
+ which have to be stored externally by the
+ caller */
+ dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ mem_heap_t* heap;
+ rec_t* rec;
+ upd_t* update;
+ ulint err;
+
+ ut_ad(cursor->index->type & DICT_CLUSTERED);
+
+ *big_rec = NULL;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp));
+
+ heap = mem_heap_create(1024);
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may NOT contain system columns trx_id or
+ roll_ptr */
+
+ update = row_upd_build_difference_binary(cursor->index, entry, ext_vec,
+ n_ext_vec, rec, thr_get_trx(thr), heap);
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(0, cursor, big_rec, update,
+ 0, thr, mtr);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*************************************************************************
+Returns TRUE if in a cascaded update/delete an ancestor node of node
+updates (not DELETE, but UPDATE) table. */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+ /* out: TRUE if an ancestor updates table */
+ que_node_t* node, /* in: node in a query graph */
+ dict_table_t* table) /* in: table */
+{
+ que_node_t* parent;
+ upd_node_t* upd_node;
+
+ parent = que_node_get_parent(node);
+
+ while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+
+ upd_node = parent;
+
+ if (upd_node->table == table && upd_node->is_delete == FALSE) {
+
+ return(TRUE);
+ }
+
+ parent = que_node_get_parent(parent);
+
+ ut_a(parent);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node. */
+static
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+ /* out: number of ancestors */
+ que_node_t* node) /* in: node in a query graph */
+{
+ que_node_t* parent;
+ ulint n_ancestors = 0;
+
+ parent = que_node_get_parent(node);
+
+ while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+ n_ancestors++;
+
+ parent = que_node_get_parent(parent);
+
+ ut_a(parent);
+ }
+
+ return(n_ancestors);
+}
+
+/**********************************************************************
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update. */
+static
+ulint
+row_ins_cascade_calc_update_vec(
+/*============================*/
+ /* out: number of fields in the
+ calculated update vector; the value
+ can also be 0 if no foreign key
+ fields changed; the returned value
+ is ULINT_UNDEFINED if the column
+ type in the child table is too short
+ to fit the new value in the parent
+ table: that means the update fails */
+ upd_node_t* node, /* in: update node of the parent
+ table */
+ dict_foreign_t* foreign, /* in: foreign key constraint whose
+ type is != 0 */
+ mem_heap_t* heap) /* in: memory heap to use as
+ temporary storage */
+{
+ upd_node_t* cascade = node->cascade_node;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index = foreign->foreign_index;
+ upd_t* update;
+ upd_field_t* ufield;
+ dict_table_t* parent_table;
+ dict_index_t* parent_index;
+ upd_t* parent_update;
+ upd_field_t* parent_ufield;
+ ulint n_fields_updated;
+ ulint parent_field_no;
+ dtype_t* type;
+ ulint i;
+ ulint j;
+
+ ut_a(node && foreign && cascade && table && index);
+
+ /* Calculate the appropriate update vector which will set the fields
+ in the child index record to the same value (possibly padded with
+ spaces if the column is a fixed length CHAR or FIXBINARY column) as
+ the referenced index record will get in the update. */
+
+ parent_table = node->table;
+ ut_a(parent_table == foreign->referenced_table);
+ parent_index = foreign->referenced_index;
+ parent_update = node->update;
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ n_fields_updated = 0;
+
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table,
+ dict_index_get_nth_col_no(
+ parent_index, i));
+
+ for (j = 0; j < parent_update->n_fields; j++) {
+ parent_ufield = parent_update->fields + j;
+
+ if (parent_ufield->field_no == parent_field_no) {
+
+ ulint fixed_size;
+
+ /* A field in the parent index record is
+ updated. Let us make the update vector
+ field for the child table. */
+
+ ufield = update->fields + n_fields_updated;
+
+ ufield->field_no =
+ dict_table_get_nth_col_pos(table,
+ dict_index_get_nth_col_no(index, i));
+ ufield->exp = NULL;
+
+ ufield->new_val = parent_ufield->new_val;
+
+ type = dict_index_get_nth_type(index, i);
+
+ /* Do not allow a NOT NULL column to be
+ updated as NULL */
+
+ if (ufield->new_val.len == UNIV_SQL_NULL
+ && (type->prtype & DATA_NOT_NULL)) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the new value would not fit in the
+ column, do not allow the update */
+
+ if (ufield->new_val.len != UNIV_SQL_NULL
+ && ufield->new_val.len
+ > dtype_get_len(type)) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the parent column type has a different
+ length than the child column type, we may
+ need to pad with spaces the new value of the
+ child column */
+
+ fixed_size = dtype_get_fixed_size(type);
+
+ /* TODO: pad in UCS-2 with 0x0020.
+ TODO: How does the special truncation of
+ UTF-8 CHAR cols affect this? */
+
+ if (fixed_size
+ && ufield->new_val.len != UNIV_SQL_NULL
+ && ufield->new_val.len < fixed_size) {
+
+ ufield->new_val.data =
+ mem_heap_alloc(heap,
+ fixed_size);
+ ufield->new_val.len = fixed_size;
+ ut_a(dtype_get_pad_char(type)
+ != ULINT_UNDEFINED);
+
+ memset(ufield->new_val.data,
+ (byte)dtype_get_pad_char(type),
+ fixed_size);
+ ut_memcpy(ufield->new_val.data,
+ parent_ufield->new_val.data,
+ parent_ufield->new_val.len);
+ }
+
+ ufield->extern_storage = FALSE;
+
+ n_fields_updated++;
+ }
+ }
+ }
+
+ update->n_fields = n_fields_updated;
+
+ return(n_fields_updated);
+}
+
+/*************************************************************************
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+ const char* errstr, /* in: error string from the viewpoint
+ of the parent table */
+ que_thr_t* thr, /* in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /* in: foreign key constraint */
+ rec_t* rec, /* in: a matching index record in the
+ child table */
+ dtuple_t* entry) /* in: index entry in the parent
+ table */
+{
+ FILE* ef = dict_foreign_err_file;
+ trx_t* trx = thr_get_trx(thr);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign);
+ putc('\n', ef);
+ fputs(errstr, ef);
+ fputs(" in parent table, in index ", ef);
+ ut_print_name(ef, trx, foreign->referenced_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in child table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, foreign->foreign_index->name);
+ if (rec) {
+ fputs(", there is a record:\n", ef);
+ rec_print(ef, rec, foreign->foreign_index);
+ } else {
+ fputs(", the record is not available\n", ef);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*************************************************************************
+Reports a foreign key error to dict_foreign_err_buf when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+ trx_t* trx, /* in: transaction */
+ dict_foreign_t* foreign, /* in: foreign key constraint */
+ rec_t* rec, /* in: a record in the parent table:
+ it does not match entry because we
+ have an error! */
+ dtuple_t* entry) /* in: index entry to insert in the
+ child table */
+{
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx);
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign);
+ fputs("\nTrying to add in child table, in index ", ef);
+ ut_print_name(ef, trx, foreign->foreign_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in parent table ", ef);
+ ut_print_name(ef, trx, foreign->referenced_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, foreign->referenced_index->name);
+ fputs(",\nthe closest match we can find is record:\n", ef);
+ if (rec && page_rec_is_supremum(rec)) {
+ /* If the cursor ended on a supremum record, it is better
+ to report the previous record in the error message, so that
+ the user gets a more descriptive error message. */
+ rec = page_rec_get_prev(rec);
+ }
+
+ if (rec) {
+ rec_print(ef, rec, foreign->foreign_index);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*************************************************************************
+Invalidate the query cache for the given table. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+ que_thr_t* thr, /* in: query thread whose run_node
+ is an update node */
+ const char* name) /* in: table name prefixed with
+ database name and a '/' character */
+{
+ char* buf;
+ char* ptr;
+ ulint len = strlen(name) + 1;
+
+ buf = mem_strdupl(name, len);
+
+ ptr = strchr(buf, '/');
+ ut_a(ptr);
+ *ptr = '\0';
+
+ /* We call a function in ha_innodb.cc */
+#ifndef UNIV_HOTBACKUP
+ innobase_invalidate_query_cache(thr_get_trx(thr), buf, len);
+#endif
+ mem_free(buf);
+}
+
+/*************************************************************************
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT. */
+static
+ulint
+row_ins_foreign_check_on_constraint(
+/*================================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ or error code */
+ que_thr_t* thr, /* in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /* in: foreign key constraint whose
+ type is != 0 */
+ btr_pcur_t* pcur, /* in: cursor placed on a matching
+ index record in the child table */
+ dtuple_t* entry, /* in: index entry in the parent
+ table */
+ mtr_t* mtr) /* in: mtr holding the latch of pcur
+ page */
+{
+ upd_node_t* node;
+ upd_node_t* cascade;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ mem_heap_t* upd_vec_heap = NULL;
+ rec_t* rec;
+ rec_t* clust_rec;
+ upd_t* update;
+ ulint n_to_update;
+ ulint err;
+ ulint i;
+ trx_t* trx;
+ mem_heap_t* tmp_heap = NULL;
+
+ ut_a(thr && foreign && pcur && mtr);
+
+ trx = thr_get_trx(thr);
+
+ /* Since we are going to delete or update a row, we have to invalidate
+ the MySQL query cache for table. A deadlock of threads is not possible
+ here because the caller of this function does not hold any latches with
+ the sync0sync.h rank above the kernel mutex. The query cache mutex has
+ a rank just above the kernel mutex. */
+
+ row_ins_invalidate_query_cache(thr, table->name);
+
+ node = thr->run_node;
+
+ if (node->is_delete && 0 == (foreign->type &
+ (DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+ row_ins_foreign_report_err("Trying to delete",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (!node->is_delete && 0 == (foreign->type &
+ (DICT_FOREIGN_ON_UPDATE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* This is an UPDATE */
+
+ row_ins_foreign_report_err("Trying to update",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (node->cascade_node == NULL) {
+ /* Extend our query graph by creating a child to current
+ update node. The child is used in the cascade or set null
+ operation. */
+
+ node->cascade_heap = mem_heap_create(128);
+ node->cascade_node = row_create_update_node_for_mysql(
+ table, node->cascade_heap);
+ que_node_set_parent(node->cascade_node, node);
+ }
+
+ /* Initialize cascade_node to do the operation we want. Note that we
+ use the SAME cascade node to do all foreign key operations of the
+ SQL DELETE: the table of the cascade node may change if there are
+ several child tables to the table where the delete is done! */
+
+ cascade = node->cascade_node;
+
+ cascade->table = table;
+
+ cascade->foreign = foreign;
+
+ if (node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+ cascade->is_delete = TRUE;
+ } else {
+ cascade->is_delete = FALSE;
+
+ if (foreign->n_fields > cascade->update_n_fields) {
+ /* We have to make the update vector longer */
+
+ cascade->update = upd_create(foreign->n_fields,
+ node->cascade_heap);
+ cascade->update_n_fields = foreign->n_fields;
+ }
+ }
+
+ /* We do not allow cyclic cascaded updating (DELETE is allowed,
+ but not UPDATE) of the same table, as this can lead to an infinite
+ cycle. Check that we are not updating the same table which is
+ already being modified in this cascade chain. We have to check
+ this also because the modification of the indexes of a 'parent'
+ table may still be incomplete, and we must avoid seeing the indexes
+ of the parent table in an inconsistent state! */
+
+ if (!cascade->is_delete
+ && row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+ /* We do not know if this would break foreign key
+ constraints, but play safe and return an error */
+
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+"Trying an update, possibly causing a cyclic cascaded update\n"
+"in the child table,", thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (row_ins_cascade_n_ancestors(cascade) >= 15) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+"Trying a too deep cascaded delete or update\n",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_a(index == foreign->foreign_index);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (index->type & DICT_CLUSTERED) {
+ /* pcur is already positioned in the clustered index of
+ the child table */
+
+ clust_index = index;
+ clust_rec = rec;
+ } else {
+ /* We have to look for the record in the clustered index
+ in the child table */
+
+ clust_index = dict_table_get_first_index(table);
+
+ tmp_heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+ tmp_heap);
+ btr_pcur_open_with_no_init(clust_index, ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(cascade->pcur);
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(cascade->pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ fputs(
+ "InnoDB: error in cascade of a foreign key op\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ fputs("\n"
+ "InnoDB: clustered record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Set an X-lock on the row to delete or update in the child table */
+
+ err = lock_table(0, table, LOCK_IX, thr);
+
+ if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
+ err = lock_clust_rec_read_check_and_lock_alt(0, clust_rec,
+ clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto nonstandard_exit_func;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, table->comp)) {
+ /* This can happen if there is a circular reference of
+ rows such that cascading delete comes to delete a row
+ already in the process of being delete marked */
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ if ((node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL))
+ || (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* Build the appropriate update vector which sets
+ foreign->n_fields first fields in rec to SQL NULL */
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ (update->fields + i)->field_no
+ = dict_table_get_nth_col_pos(table,
+ dict_index_get_nth_col_no(index, i));
+ (update->fields + i)->exp = NULL;
+ (update->fields + i)->new_val.len = UNIV_SQL_NULL;
+ (update->fields + i)->new_val.data = NULL;
+ (update->fields + i)->extern_storage = FALSE;
+ }
+ }
+
+ if (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+ /* Build the appropriate update vector which sets changing
+ foreign->n_fields first fields in rec to new values */
+
+ upd_vec_heap = mem_heap_create(256);
+
+ n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
+ upd_vec_heap);
+ if (n_to_update == ULINT_UNDEFINED) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+"Trying a cascaded update where the updated value in the child\n"
+"table would not fit in the length of the column, or the value would\n"
+"be NULL and the column is declared as not NULL in the child table,",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (cascade->update->n_fields == 0) {
+
+ /* The update does not change any columns referred
+ to in this foreign key constraint: no need to do
+ anything */
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Store pcur position and initialize or store the cascade node
+ pcur stored position */
+
+ btr_pcur_store_position(pcur, mtr);
+
+ if (index == clust_index) {
+ btr_pcur_copy_stored_position(cascade->pcur, pcur);
+ } else {
+ btr_pcur_store_position(cascade->pcur, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+ cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ err = row_update_cascade_for_mysql(thr, cascade,
+ foreign->foreign_table);
+
+ if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
+ fprintf(stderr,
+"InnoDB: error: table %s has the counter 0 though there is\n"
+"InnoDB: a FOREIGN KEY check running on it.\n",
+ foreign->foreign_table->name);
+ }
+
+ /* Release the data dictionary latch for a while, so that we do not
+ starve other threads from doing CREATE TABLE etc. if we have a huge
+ cascaded operation running. The counter n_foreign_key_checks_running
+ will prevent other users from dropping or ALTERing the table when we
+ release the latch. */
+
+ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+ row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+ mtr_start(mtr);
+
+ /* Restore pcur position */
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ return(err);
+
+nonstandard_exit_func:
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ btr_pcur_store_position(pcur, mtr);
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ return(err);
+}
+
+/*************************************************************************
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints. */
+static
+ulint
+row_ins_set_shared_rec_lock(
+/*========================*/
+ /* out: DB_SUCCESS or error code */
+ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: index */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (index->type & DICT_CLUSTERED) {
+ err = lock_clust_rec_read_check_and_lock(0,
+ rec, index, offsets, LOCK_S, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(0,
+ rec, index, offsets, LOCK_S, type, thr);
+ }
+
+ return(err);
+}
+
+/*************************************************************************
+Sets a exclusive lock on a record. Used in locking possible duplicate key
+records */
+static
+ulint
+row_ins_set_exclusive_rec_lock(
+/*============================*/
+ /* out: DB_SUCCESS or error code */
+ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: index */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (index->type & DICT_CLUSTERED) {
+ err = lock_clust_rec_read_check_and_lock(0,
+ rec, index, offsets, LOCK_X, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(0,
+ rec, index, offsets, LOCK_X, type, thr);
+ }
+
+ return(err);
+}
+
+/*******************************************************************
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock. */
+
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+ /* out: DB_SUCCESS,
+ DB_NO_REFERENCED_ROW,
+ or DB_ROW_IS_REFERENCED */
+ ibool check_ref,/* in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to to check the foreign key table */
+ dict_foreign_t* foreign,/* in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /* in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /* in: index entry for index */
+ que_thr_t* thr) /* in: query thread */
+{
+ upd_node_t* upd_node;
+ dict_table_t* check_table;
+ dict_index_t* check_index;
+ ulint n_fields_cmp;
+ rec_t* rec;
+ btr_pcur_t pcur;
+ ibool moved;
+ int cmp;
+ ulint err;
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ err = DB_SUCCESS;
+
+ if (trx->check_foreigns == FALSE) {
+ /* The user has suppressed foreign key checks currently for
+ this session */
+ goto exit_func;
+ }
+
+ /* If any of the foreign key fields in entry is SQL NULL, we
+ suppress the foreign key check: this is compatible with Oracle,
+ for example */
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ goto exit_func;
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+ upd_node = thr->run_node;
+
+ if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+ /* If a cascaded update is done as defined by a
+ foreign key constraint, do not check that
+ constraint for the child row. In ON UPDATE CASCADE
+ the update of the parent row is only half done when
+ we come here: if we would check the constraint here
+ for the child row it would fail.
+
+ A QUESTION remains: if in the child table there are
+ several constraints which refer to the same parent
+ table, we should merge all updates to the child as
+ one update? And the updates can be contradictory!
+ Currently we just perform the update associated
+ with each foreign key constraint, one after
+ another, and the user has problems predicting in
+ which order they are performed. */
+
+ goto exit_func;
+ }
+ }
+
+ if (check_ref) {
+ check_table = foreign->referenced_table;
+ check_index = foreign->referenced_index;
+ } else {
+ check_table = foreign->foreign_table;
+ check_index = foreign->foreign_index;
+ }
+
+ if (check_table == NULL || check_table->ibd_file_missing) {
+ if (check_ref) {
+ FILE* ef = dict_foreign_err_file;
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx);
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef,
+ trx, foreign);
+ fputs("\nTrying to add to index ", ef);
+ ut_print_name(ef, trx, foreign->foreign_index->name);
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ fputs("\nBut the parent table ", ef);
+ ut_print_name(ef, trx, foreign->referenced_table_name);
+ fputs("\nor its .ibd file does not currently exist!\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_NO_REFERENCED_ROW;
+ }
+
+ goto exit_func;
+ }
+
+ ut_a(check_table && check_index);
+
+ if (check_table != table) {
+ /* We already have a LOCK_IX on table, but not necessarily
+ on check_table */
+
+ err = lock_table(0, check_table, LOCK_IS, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto do_possible_lock_wait;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store old value on n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+
+ btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check if there is a matching record */
+
+ for (;;) {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ goto next_rec;
+ }
+
+ offsets = rec_get_offsets(rec, check_index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec,
+ check_index, offsets, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ goto next_rec;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (rec_get_deleted_flag(rec,
+ rec_offs_comp(offsets))) {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY, rec,
+ check_index, offsets, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+ } else {
+ /* Found a matching record. Lock only
+ a record because we can allow inserts
+ into gaps */
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, rec,
+ check_index, offsets, thr);
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ if (check_ref) {
+ err = DB_SUCCESS;
+
+ break;
+ } else if (foreign->type != 0) {
+ /* There is an ON UPDATE or ON DELETE
+ condition: check them in a separate
+ function */
+
+ err =
+ row_ins_foreign_check_on_constraint(
+ thr, foreign, &pcur, entry,
+ &mtr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+ } else {
+ row_ins_foreign_report_err(
+ "Trying to delete or update",
+ thr, foreign, rec, entry);
+
+ err = DB_ROW_IS_REFERENCED;
+ break;
+ }
+ }
+ }
+
+ if (cmp < 0) {
+ err = row_ins_set_shared_rec_lock(LOCK_GAP,
+ rec, check_index, offsets, thr);
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ if (check_ref) {
+ err = DB_NO_REFERENCED_ROW;
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ break;
+ }
+
+ ut_a(cmp == 0);
+next_rec:
+ moved = btr_pcur_move_to_next(&pcur, &mtr);
+
+ if (!moved) {
+ if (check_ref) {
+ rec = btr_pcur_get_rec(&pcur);
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ break;
+ }
+ }
+
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+ if (err == DB_LOCK_WAIT) {
+ trx->error_state = err;
+
+ que_thr_stop_for_mysql(thr);
+
+ srv_suspend_mysql_thread(thr);
+
+ if (trx->error_state == DB_SUCCESS) {
+
+ goto run_again;
+ }
+
+ err = trx->error_state;
+ }
+
+exit_func:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*******************************************************************
+Checks if foreign key constraints fail for an index entry. If index
+is not mentioned in any constraint, this function does nothing,
+Otherwise does searches to the indexes of referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint. */
+static
+ulint
+row_ins_check_foreign_constraints(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ dict_table_t* table, /* in: table */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry for index */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_foreign_t* foreign;
+ ulint err;
+ trx_t* trx;
+ ibool got_s_lock = FALSE;
+
+ trx = thr_get_trx(thr);
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign) {
+ if (foreign->foreign_index == index) {
+
+ if (foreign->referenced_table == NULL) {
+ dict_table_get(foreign->referenced_table_name,
+ trx);
+ }
+
+ if (0 == trx->dict_operation_lock_mode) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (foreign->referenced_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ (foreign->referenced_table
+ ->n_foreign_key_checks_running)++;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects the referenced
+ table from being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(TRUE, foreign,
+ table, entry, thr);
+
+ if (foreign->referenced_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ ut_a(foreign->referenced_table
+ ->n_foreign_key_checks_running > 0);
+ (foreign->referenced_table
+ ->n_foreign_key_checks_running)--;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************
+Checks if a unique key violation to rec would occur at the index entry
+insert. */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ /* out: TRUE if error */
+ rec_t* rec, /* in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ dtuple_t* entry, /* in: entry to insert */
+ dict_index_t* index, /* in: index */
+ const ulint* offsets)/* in: rec_get_offsets(rec, index) */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, offsets,
+ &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!(index->type & DICT_CLUSTERED)) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ if (!rec_get_deleted_flag(rec, index->table->comp)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Set shared locks on possible duplicate records. */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ /* out: DB_SUCCESS, DB_DUPLICATE_KEY, or
+ DB_LOCK_WAIT */
+ dict_index_t* index, /* in: non-clustered unique index */
+ dtuple_t* entry, /* in: index entry */
+ que_thr_t* thr) /* in: query thread */
+{
+#ifndef UNIV_HOTBACKUP
+ ulint n_unique;
+ ulint i;
+ int cmp;
+ ulint n_fields_cmp;
+ rec_t* rec;
+ btr_pcur_t pcur;
+ ulint err = DB_SUCCESS;
+ ibool moved;
+ mtr_t mtr;
+ trx_t* trx;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ n_unique = dict_index_get_n_unique(index);
+
+ /* If the secondary index is unique, but one of the fields in the
+ n_unique first fields is NULL, a unique key violation cannot occur,
+ since we define NULL != NULL in this case */
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(DB_SUCCESS);
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store old value on n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check if there is a duplicate */
+
+ for (;;) {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ goto next_rec;
+ }
+
+ /* Try to place a lock on the index record */
+
+ trx = thr_get_trx(thr);
+ ut_ad(trx);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (innobase_query_is_update()) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+ duplicates ( REPLACE, LOAD DATAFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(LOCK_ORDINARY,
+ rec, index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY,
+ rec, index, offsets, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ goto next_rec;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ index, offsets)) {
+ err = DB_DUPLICATE_KEY;
+
+ thr_get_trx(thr)->error_info = index;
+
+ break;
+ }
+ }
+
+ if (cmp < 0) {
+ break;
+ }
+
+ ut_a(cmp == 0);
+next_rec:
+ moved = btr_pcur_move_to_next(&pcur, &mtr);
+
+ if (!moved) {
+ break;
+ }
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ return(err);
+#else /* UNIV_HOTBACKUP */
+ /* This function depends on MySQL code that is not included in
+ InnoDB Hot Backup builds. Besides, this function should never
+ be called in InnoDB Hot Backup. */
+ ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/*******************************************************************
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index! */
+static
+ulint
+row_ins_duplicate_error_in_clust(
+/*=============================*/
+ /* out: DB_SUCCESS if no error,
+ DB_DUPLICATE_KEY if error, DB_LOCK_WAIT if we
+ have to wait for a lock on a possible
+ duplicate record */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ dtuple_t* entry, /* in: entry to insert */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+#ifndef UNIV_HOTBACKUP
+ ulint err;
+ rec_t* rec;
+ page_t* page;
+ ulint n_unique;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t*heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ UT_NOT_USED(mtr);
+
+ ut_a(cursor->index->type & DICT_CLUSTERED);
+ ut_ad(cursor->index->type & DICT_UNIQUE);
+
+ /* NOTE: For unique non-clustered indexes there may be any number
+ of delete marked records with the same value for the non-clustered
+ index key (remember multiversioning), and which differ only in
+ the row refererence part of the index record, containing the
+ clustered index key fields. For such a secondary index record,
+ to avoid race condition, we must FIRST do the insertion and after
+ that check that the uniqueness condition is not breached! */
+
+ /* NOTE: A problem is that in the B-tree node pointers on an
+ upper level may match more to the entry than the actual existing
+ user records on the leaf level. So, even if low_match would suggest
+ that a duplicate key violation may occur, this may not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_infimum_rec(page)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* We set a lock on the possible duplicate: this
+ is needed in logical logging of MySQL to make
+ sure that in roll-forward we get the same duplicate
+ errors as in original execution */
+
+ if (innobase_query_is_update()) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+ duplicates ( REPLACE, LOAD DATAFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,rec,cursor->index,
+ offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,rec, cursor->index,
+ offsets, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ cursor->index, offsets)) {
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_supremum_rec(page)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (innobase_query_is_update()) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+ duplicates ( REPLACE, LOAD DATAFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP, rec,
+ cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, rec,
+ cursor->index, offsets, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ cursor->index, offsets)) {
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ mem_heap_free(heap);
+ }
+
+ ut_a(!(cursor->index->type & DICT_CLUSTERED));
+ /* This should never happen */
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ return(err);
+#else /* UNIV_HOTBACKUP */
+ /* This function depends on MySQL code that is not included in
+ InnoDB Hot Backup builds. Besides, this function should never
+ be called in InnoDB Hot Backup. */
+ ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/*******************************************************************
+Checks if an index entry has long enough common prefix with an existing
+record so that the intended insert of the entry must be changed to a modify of
+the existing record. In the case of a clustered index, the prefix must be
+n_unique fields long, and in the case of a secondary index, all fields must be
+equal. */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+ /* out: 0 if no update, ROW_INS_PREV if
+ previous should be updated; currently we
+ do the search so that only the low_match
+ record can match enough to the search tuple,
+ not the next record */
+ btr_cur_t* cursor) /* in: B-tree cursor */
+{
+ ulint enough_match;
+ rec_t* rec;
+ page_t* page;
+
+ /* NOTE: (compare to the note in row_ins_duplicate_error) Because node
+ pointers on upper levels of the B-tree may match more to entry than
+ to actual user records on the leaf level, we have to check if the
+ candidate record is actually a user record. In a clustered index
+ node pointers contain index->n_unique first fields, and in the case
+ of a secondary index, all fields of the index. */
+
+ enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+
+ if (cursor->low_match >= enough_match) {
+
+ rec = btr_cur_get_rec(cursor);
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_infimum_rec(page)) {
+
+ return(ROW_INS_PREV);
+ }
+ }
+
+ return(0);
+}
+
+/*******************************************************************
+Tries to insert an index entry to an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily marked deleted by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to an
+existing record, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index. */
+
+ulint
+row_ins_index_entry_low(
+/*====================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL
+ if pessimistic retry needed, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_cur_t cursor;
+ ulint ignore_sec_unique = 0;
+ ulint modify = 0; /* remove warning */
+ rec_t* insert_rec;
+ rec_t* rec;
+ rec_t* first_rec;
+ ulint err;
+ ulint n_unique;
+ big_rec_t* big_rec = NULL;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+
+ cursor.thr = thr;
+
+ /* Note that we use PAGE_CUR_LE as the search mode, because then
+ the function will return in both low_match and up_match of the
+ cursor sensible values */
+
+ if (!(thr_get_trx(thr)->check_unique_secondary)) {
+ ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE;
+ }
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ mode | BTR_INSERT | ignore_sec_unique,
+ &cursor, 0, &mtr);
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ /* The insertion was made to the insert buffer already during
+ the search: we are done */
+
+ err = DB_SUCCESS;
+
+ goto function_exit;
+ }
+
+ first_rec = page_rec_get_next(page_get_infimum_rec(
+ buf_frame_align(btr_cur_get_rec(&cursor))));
+
+ if (!page_rec_is_supremum(first_rec)) {
+ ut_a(rec_get_n_fields(first_rec, index)
+ == dtuple_get_n_fields(entry));
+ }
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique
+ || cursor.low_match >= n_unique)) {
+
+ if (index->type & DICT_CLUSTERED) {
+ /* Note that the following may return also
+ DB_LOCK_WAIT */
+
+ err = row_ins_duplicate_error_in_clust(&cursor,
+ entry, thr, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ } else {
+ mtr_commit(&mtr);
+ err = row_ins_scan_sec_index_for_duplicate(index,
+ entry, thr);
+ mtr_start(&mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ /* We did not find a duplicate and we have now
+ locked with s-locks the necessary records to
+ prevent any insertion of a duplicate by another
+ transaction. Let us now reposition the cursor and
+ continue the insertion. */
+
+ btr_cur_search_to_nth_level(index, 0, entry,
+ PAGE_CUR_LE, mode | BTR_INSERT,
+ &cursor, 0, &mtr);
+ }
+ }
+
+ modify = row_ins_must_modify(&cursor);
+
+ if (modify != 0) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+
+ if (modify == ROW_INS_NEXT) {
+ rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+ btr_cur_position(index, rec, &cursor);
+ }
+
+ if (index->type & DICT_CLUSTERED) {
+ err = row_ins_clust_index_entry_by_modify(mode,
+ &cursor, &big_rec,
+ entry,
+ ext_vec, n_ext_vec,
+ thr, &mtr);
+ } else {
+ err = row_ins_sec_index_entry_by_modify(mode, &cursor,
+ entry,
+ thr, &mtr);
+ }
+
+ } else {
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(0, &cursor, entry,
+ &insert_rec, &big_rec, thr, &mtr);
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_insert(0, &cursor, entry,
+ &insert_rec, &big_rec, thr, &mtr);
+ }
+
+ if (err == DB_SUCCESS) {
+ if (ext_vec) {
+ rec_set_field_extern_bits(insert_rec, index,
+ ext_vec, n_ext_vec, &mtr);
+ }
+ }
+ }
+
+function_exit:
+ mtr_commit(&mtr);
+
+ if (big_rec) {
+ rec_t* rec;
+ mtr_start(&mtr);
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0, &mtr);
+ rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ err = btr_store_big_rec_extern_fields(index, rec,
+ offsets, big_rec, &mtr);
+
+ if (modify) {
+ dtuple_big_rec_free(big_rec);
+ } else {
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*******************************************************************
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record. */
+
+ulint
+row_ins_index_entry(
+/*================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DUPLICATE_KEY, or some other error code */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ if (UT_LIST_GET_FIRST(index->table->foreign_list)) {
+ err = row_ins_check_foreign_constraints(index->table, index,
+ entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
+ ext_vec, n_ext_vec, thr);
+ if (err != DB_FAIL) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
+ ext_vec, n_ext_vec, thr);
+ return(err);
+}
+
+/***************************************************************
+Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row. */
+static
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to make */
+ dtuple_t* row) /* in: row */
+{
+ dict_field_t* ind_field;
+ dfield_t* field;
+ dfield_t* row_field;
+ ulint n_fields;
+ ulint i;
+ dtype_t* cur_type;
+
+ ut_ad(entry && row);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(entry, i);
+ ind_field = dict_index_get_nth_field(index, i);
+
+ row_field = dtuple_get_nth_field(row, ind_field->col->ind);
+
+ /* Check column prefix indexes */
+ if (ind_field->prefix_len > 0
+ && dfield_get_len(row_field) != UNIV_SQL_NULL) {
+
+ cur_type = dict_col_get_type(
+ dict_field_get_col(ind_field));
+
+ field->len = dtype_get_at_most_n_mbchars(cur_type,
+ ind_field->prefix_len,
+ dfield_get_len(row_field), row_field->data);
+ } else {
+ field->len = row_field->len;
+ }
+
+ field->data = row_field->data;
+ }
+}
+
+/***************************************************************
+Inserts a single index entry to the table. */
+static
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ row_ins_index_entry_set_vals(node->index, node->entry, node->row);
+
+ ut_ad(dtuple_check_typed(node->entry));
+
+ err = row_ins_index_entry(node->index, node->entry, NULL, 0, thr);
+
+ return(err);
+}
+
+/***************************************************************
+Allocates a row id for row and inits the node->index field. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ dulint row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_table_get_first_index(node->table)->type & DICT_UNIQUE) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
+
+ dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***************************************************************
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied in the buffers of the select node and
+ it is safe to use them until we fetch from select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***************************************************************
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied in the buffers of the select node and
+ it is safe to use them until we fetch from select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***************************************************************
+Inserts a row to a table. */
+
+ulint
+row_ins(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+ }
+
+ ut_ad(node->entry == NULL);
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_ins_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ ulint err;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started(trx);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. */
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+ /* No need to do IX-locking or write trx id to buf */
+
+ goto same_trx;
+ }
+
+ trx_write_trx_id(node->trx_id_buf, trx->id);
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+ same_trx:
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* err == DB_LOCK_WAIT or SQL error detected */
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c
new file mode 100644
index 00000000000..7f78a5b723b
--- /dev/null
+++ b/storage/innobase/row/row0mysql.c
@@ -0,0 +1,4085 @@
+/******************************************************
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+(c) 2000 Innobase Oy
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "log0log.h"
+#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
+
+/* A dummy variable used to fool the compiler */
+ibool row_mysql_identically_false = FALSE;
+
+/* List of tables we should drop in background. ALTER TABLE in MySQL requires
+that the table handler can drop the table in background when there are no
+queries to it any more. Protected by the kernel mutex. */
+typedef struct row_mysql_drop_struct row_mysql_drop_t;
+struct row_mysql_drop_struct{
+ char* table_name;
+ UT_LIST_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
+};
+
+UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
+ibool row_mysql_drop_list_inited = FALSE;
+
+/* Magic table names for invoking various monitor threads */
+static const char S_innodb_monitor[] = "innodb_monitor";
+static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
+static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
+static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+
+/* Name suffix for recovered orphaned temporary tables */
+static const char S_recover_innodb_tmp_table[] = "_recover_innodb_tmp_table";
+/***********************************************************************
+Determine if the given name ends in the suffix reserved for recovered
+orphaned temporary tables. */
+static
+ibool
+row_mysql_is_recovered_tmp_table(
+/*=============================*/
+ /* out: TRUE if table name ends in
+ the reserved suffix */
+ const char* name)
+{
+ ulint namelen = strlen(name) + 1;
+ return(namelen >= sizeof S_recover_innodb_tmp_table
+ && !memcmp(name + namelen -
+ sizeof S_recover_innodb_tmp_table,
+ S_recover_innodb_tmp_table,
+ sizeof S_recover_innodb_tmp_table));
+}
+
+/***********************************************************************
+Determine if the given name is a name reserved for MySQL system tables. */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+ /* out: TRUE if name is a MySQL
+ system table name */
+ const char* name)
+{
+ if (memcmp(name, "mysql/", 6)) {
+ return(FALSE);
+ }
+ return(0 == strcmp(name + 6, "host")
+ || 0 == strcmp(name + 6, "user")
+ || 0 == strcmp(name + 6, "db"));
+}
+
+/***********************************************************************
+Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+ if (srv_dml_needed_delay) {
+ os_thread_sleep(srv_dml_needed_delay);
+ }
+}
+
+/***********************************************************************
+Frees the blob heap in prebuilt when no longer needed. */
+
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct of a
+ ha_innobase:: table handle */
+{
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+}
+
+/***********************************************************************
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format. */
+
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ /* out: pointer to the data, we skip the 1 or 2 bytes
+ at the start that are used to store the len */
+ byte* dest, /* in: where to store */
+ ulint len, /* in: length, must fit in two bytes */
+ ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ ut_a(len < 256 * 256);
+
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+ }
+
+ ut_a(lenlen == 1);
+ ut_a(len < 256);
+
+ mach_write_to_1(dest, len);
+
+ return(dest + 1);
+}
+
+/***********************************************************************
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data. */
+
+byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ /* out: pointer to the data, we skip the 1 or 2 bytes
+ at the start that are used to store the len */
+ ulint* len, /* out: variable-length field length */
+ byte* field, /* in: field in the MySQL format */
+ ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+ }
+
+ ut_a(lenlen == 1);
+
+ *len = mach_read_from_1(field);
+
+ return(field + 1);
+}
+
+/***********************************************************************
+Stores a reference to a BLOB in the MySQL format. */
+
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /* in: where to store */
+ ulint col_len, /* in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ byte* data, /* in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len) /* in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+{
+ /* MySQL might assume the field is set to zero except the length and
+ the pointer fields */
+
+ memset(dest, '\0', col_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_a(col_len - 8 > 1 || len < 256);
+ ut_a(col_len - 8 > 2 || len < 256 * 256);
+ ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ ut_memcpy(dest + col_len - 8, (byte*)&data, sizeof(byte*));
+}
+
+/***********************************************************************
+Reads a reference to a BLOB in the MySQL format. */
+
+byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ /* out: pointer to BLOB data */
+ ulint* len, /* out: BLOB length */
+ byte* ref, /* in: BLOB reference in the MySQL format */
+ ulint col_len) /* in: BLOB reference length (not BLOB
+ length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ ut_memcpy((byte*)&data, ref + col_len - 8, sizeof(byte*));
+
+ return(data);
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c. */
+
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ /* out: up to which byte we used
+ buf in the conversion */
+ dfield_t* dfield, /* in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /* in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! */
+ ibool row_format_col, /* TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ byte* mysql_data, /* in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /* in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ibool comp) /* in: TRUE = compact format */
+{
+ byte* ptr = mysql_data;
+ dtype_t* dtype;
+ ulint type;
+ ulint lenlen;
+
+ dtype = dfield_get_type(dfield);
+
+ type = dtype->mtype;
+
+ if (type == DATA_INT) {
+ /* Store integer data in Innobase in a big-endian format,
+ sign bit negated if the data is a signed integer. In MySQL,
+ integers are stored in a little-endian format. */
+
+ ptr = buf + col_len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *mysql_data;
+ if (ptr == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!(dtype->prtype & DATA_UNSIGNED)) {
+
+ *ptr = (byte) (*ptr ^ 128);
+ }
+
+ buf += col_len;
+ } else if ((type == DATA_VARCHAR
+ || type == DATA_VARMYSQL
+ || type == DATA_BINARY)) {
+
+ if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+ /* The length of the actual data is stored to 1 or 2
+ bytes at the start of the field */
+
+ if (row_format_col) {
+ if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+ lenlen = 2;
+ } else {
+ lenlen = 1;
+ }
+ } else {
+ /* In a MySQL key value, lenlen is always 2 */
+ lenlen = 2;
+ }
+
+ ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+ lenlen);
+ } else {
+ /* Remove trailing spaces from old style VARCHAR
+ columns. */
+
+ /* Handle UCS2 strings differently. */
+ ulint mbminlen = dtype_get_mbminlen(dtype);
+
+ ptr = mysql_data;
+
+ if (mbminlen == 2) {
+ /* space=0x0020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~1;
+
+ while (col_len >= 2 && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 2;
+ }
+ } else {
+ ut_a(mbminlen == 1);
+ /* space=0x20 */
+ while (col_len > 0
+ && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ }
+ }
+ } else if (comp && type == DATA_MYSQL
+ && dtype_get_mbminlen(dtype) == 1
+ && dtype_get_mbmaxlen(dtype) > 1) {
+ /* In some cases we strip trailing spaces from UTF-8 and other
+ multibyte charsets, from FIXED-length CHAR columns, to save
+ space. UTF-8 would otherwise normally use 3 * the string length
+ bytes to store a latin1 string! */
+
+ /* We assume that this CHAR field is encoded in a
+ variable-length character set where spaces have
+ 1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+ Consider a CHAR(n) field, a field of n characters.
+ It will contain between n * mbminlen and n * mbmaxlen bytes.
+ We will try to truncate it to n bytes by stripping
+ space padding. If the field contains single-byte
+ characters only, it will be truncated to n characters.
+ Consider a CHAR(5) field containing the string ".a "
+ where "." denotes a 3-byte character represented by
+ the bytes "$%&". After our stripping, the string will
+ be stored as "$%&a " (5 bytes). The string ".abc "
+ will be stored as "$%&abc" (6 bytes).
+
+ The space padding will be restored in row0sel.c, function
+ row_sel_field_store_in_mysql_format(). */
+
+ ulint n_chars;
+
+ ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ } else if (type == DATA_BLOB && row_format_col) {
+
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+
+ return(buf);
+}
+
+/******************************************************************
+Convert a row in the MySQL format to a row in the Innobase format. Note that
+the function to convert a MySQL format key value to an InnoDB dtuple is
+row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /* in/out: Innobase row where the
+ field type information is already
+ copied there! */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ byte* mysql_rec) /* in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+{
+ mysql_row_templ_t* templ;
+ dfield_t* dfield;
+ ulint i;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+ dfield = dtuple_get_nth_field(row, i);
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset] &
+ (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(dfield,
+ prebuilt->ins_upd_rec_buff
+ + templ->mysql_col_offset,
+ TRUE, /* MySQL row format data */
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ prebuilt->table->comp);
+next_column:
+ ;
+ }
+}
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine. */
+
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ /* out: TRUE if it was a lock wait and
+ we should continue running the query thread */
+ ulint* new_err,/* out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* thr, /* in: query thread */
+ trx_savept_t* savept) /* in: savepoint or NULL */
+{
+#ifndef UNIV_HOTBACKUP
+ ulint err;
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (err == DB_DUPLICATE_KEY) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ } else if (err == DB_TOO_BIG_RECORD) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+ } else if (err == DB_ROW_IS_REFERENCED
+ || err == DB_NO_REFERENCED_ROW
+ || err == DB_CANNOT_ADD_CONSTRAINT) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+ } else if (err == DB_LOCK_WAIT) {
+
+ srv_suspend_mysql_thread(thr);
+
+ if (trx->error_state != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(TRUE);
+
+ } else if (err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT
+ || err == DB_LOCK_TABLE_FULL) {
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+
+ } else if (err == DB_MUST_GET_MORE_FILE_SPACE) {
+
+ fputs(
+ "InnoDB: The database cannot continue operation because of\n"
+ "InnoDB: lack of space. You must add a new data file to\n"
+ "InnoDB: my.cnf and restart the database.\n", stderr);
+
+ exit(1);
+ } else if (err == DB_CORRUPTION) {
+
+ fputs(
+ "InnoDB: We detected index corruption in an InnoDB type table.\n"
+ "InnoDB: You have to dump + drop + reimport the table or, in\n"
+ "InnoDB: a case of widespread corruption, dump all InnoDB\n"
+ "InnoDB: tables and recreate the whole InnoDB tablespace.\n"
+ "InnoDB: If the mysqld server crashes after the startup or when\n"
+ "InnoDB: you dump the tables, look at\n"
+ "InnoDB: http://dev.mysql.com/doc/mysql/en/Forcing_recovery.html"
+ " for help.\n", stderr);
+
+ } else {
+ fprintf(stderr, "InnoDB: unknown error code %lu\n",
+ (ulong) err);
+ ut_error;
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(FALSE);
+#else /* UNIV_HOTBACKUP */
+ /* This function depends on MySQL code that is not included in
+ InnoDB Hot Backup builds. Besides, this function should never
+ be called in InnoDB Hot Backup. */
+ ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/************************************************************************
+Create a prebuilt struct for a MySQL table handle. */
+
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ /* out, own: a prebuilt struct */
+ dict_table_t* table) /* in: Innobase table handle */
+{
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ ulint ref_len;
+ ulint i;
+
+ heap = mem_heap_create(128);
+
+ prebuilt = mem_heap_alloc(heap, sizeof(row_prebuilt_t));
+
+ prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+ prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+ prebuilt->table = table;
+
+ prebuilt->trx = NULL;
+
+ prebuilt->sql_stat_start = TRUE;
+
+ prebuilt->mysql_has_locked = FALSE;
+
+ prebuilt->index = NULL;
+
+ prebuilt->used_in_HANDLER = FALSE;
+
+ prebuilt->n_template = 0;
+ prebuilt->mysql_template = NULL;
+
+ prebuilt->heap = heap;
+ prebuilt->ins_node = NULL;
+
+ prebuilt->ins_upd_rec_buff = NULL;
+
+ prebuilt->upd_node = NULL;
+ prebuilt->ins_graph = NULL;
+ prebuilt->upd_graph = NULL;
+
+ prebuilt->pcur = btr_pcur_create_for_mysql();
+ prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = 99999999;
+
+ prebuilt->sel_graph = NULL;
+
+ prebuilt->search_tuple = dtuple_create(heap,
+ 2 * dict_table_get_n_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Make sure that search_tuple is long enough for clustered index */
+ ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ prebuilt->fetch_cache[i] = NULL;
+ }
+
+ prebuilt->n_fetch_cached = 0;
+
+ prebuilt->blob_heap = NULL;
+
+ prebuilt->old_vers_heap = NULL;
+
+ return(prebuilt);
+}
+
+/************************************************************************
+Free a prebuilt struct for a MySQL table handle. */
+
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt) /* in, own: prebuilt struct */
+{
+ ulint i;
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED
+ || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+"InnoDB: Error: trying to free a corrupt\n"
+"InnoDB: table handle. Magic n %lu, magic n2 %lu, table name",
+ (ulong) prebuilt->magic_n,
+ (ulong) prebuilt->magic_n2);
+ ut_print_name(stderr, NULL, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption((byte*)prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->magic_n = ROW_PREBUILT_FREED;
+ prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+ btr_pcur_free_for_mysql(prebuilt->pcur);
+ btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+ if (prebuilt->mysql_template) {
+ mem_free(prebuilt->mysql_template);
+ }
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ mem_heap_free(prebuilt->blob_heap);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ if (prebuilt->fetch_cache[i] != NULL) {
+
+ if ((ROW_PREBUILT_FETCH_MAGIC_N !=
+ mach_read_from_4((prebuilt->fetch_cache[i]) - 4))
+ || (ROW_PREBUILT_FETCH_MAGIC_N !=
+ mach_read_from_4((prebuilt->fetch_cache[i])
+ + prebuilt->mysql_row_len))) {
+ fputs(
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: fetch buffer.\n", stderr);
+
+ mem_analyze_corruption(
+ prebuilt->fetch_cache[i]);
+
+ ut_error;
+ }
+
+ mem_free((prebuilt->fetch_cache[i]) - 4);
+ }
+ }
+
+ dict_table_decrement_handle_count(prebuilt->table);
+
+ mem_heap_free(prebuilt->heap);
+}
+
+/*************************************************************************
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+
+void
+row_update_prebuilt_trx(
+/*====================*/
+ /* out: prebuilt dtuple */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL
+ handle */
+ trx_t* trx) /* in: transaction handle */
+{
+ if (trx->magic_n != TRX_MAGIC_N) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: trx handle. Magic n %lu\n",
+ (ulong) trx->magic_n);
+
+ mem_analyze_corruption((byte*)trx);
+
+ ut_error;
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, NULL, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption((byte*)prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it. */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ /* out: prebuilt dtuple; the column
+ type information is also set in it */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ dict_table_t* table = prebuilt->table;
+ ulint i;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node == NULL) {
+
+ /* Not called before for this handle: create an insert node
+ and query graph to the prebuilt struct */
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(
+ prebuilt->heap,
+ prebuilt->mysql_row_len);
+ }
+
+ row = dtuple_create(prebuilt->heap,
+ dict_table_get_n_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ /* We init the value of every field to the SQL NULL to avoid
+ a debug assertion from failing */
+
+ for (i = 0; i < dtuple_get_n_fields(row); i++) {
+
+ dtuple_get_nth_field(row, i)->len = UNIV_SQL_NULL;
+ }
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->ins_node->row);
+}
+
+/*************************************************************************
+Updates the table modification counter and calculates new estimates
+for table and index statistics if necessary. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+ dict_table_t* table) /* in: table */
+{
+ ulint counter;
+
+ counter = table->stat_modified_counter;
+
+ table->stat_modified_counter = counter + 1;
+
+ /* Calculate new statistics if 1 / 16 of table has been modified
+ since the last time a statistics batch was run, or if
+ stat_modified_counter > 2 000 000 000 (to avoid wrap-around).
+ We calculate statistics at most every 16th round, since we may have
+ a counter table which is very small and updated very often. */
+
+ if (counter > 2000000000
+ || ((ib_longlong)counter > 16 + table->stat_n_rows / 16)) {
+
+ dict_update_statistics(table);
+ }
+}
+
+/*************************************************************************
+Unlocks an AUTO_INC type lock possibly reserved by trx. */
+
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+ trx_t* trx) /* in: transaction */
+{
+ if (!trx->auto_inc_lock) {
+
+ return;
+ }
+
+ lock_table_unlock_auto_inc(trx);
+}
+
+/*************************************************************************
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table. */
+
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ /* out: error code or DB_SUCCESS */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in the MySQL
+ table handle */
+{
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (trx->auto_inc_lock) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "setting auto-inc lock";
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ /* We use the insert query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Unlocks all table locks explicitly requested by trx (with LOCK TABLES,
+lock type LOCK_TABLE_EXP). */
+
+void
+row_unlock_tables_for_mysql(
+/*========================*/
+ trx_t* trx) /* in: transaction */
+{
+ if (!trx->n_lock_table_exp) {
+
+ return;
+ }
+
+ mutex_enter(&kernel_mutex);
+ lock_release_tables_off_kernel(trx);
+ mutex_exit(&kernel_mutex);
+}
+
+/*************************************************************************
+Sets a table lock on the table mentioned in prebuilt. */
+
+int
+row_lock_table_for_mysql(
+/*=====================*/
+ /* out: error code or DB_SUCCESS */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /* in: table to lock, or NULL
+ if prebuilt->table should be
+ locked or a
+ prebuilt->select_lock_type */
+ ulint mode) /* in: lock mode of table */
+{
+ trx_t* trx = prebuilt->trx;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "setting table lock";
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ if (table) {
+ err = lock_table(0, table, mode, thr);
+ } else {
+ if (mode == LOCK_TABLE_TRANSACTIONAL) {
+ err = lock_table(LOCK_TABLE_TRANSACTIONAL,
+ prebuilt->table,
+ prebuilt->select_lock_type, thr);
+ } else {
+ err = lock_table(LOCK_TABLE_EXP, prebuilt->table,
+ prebuilt->select_lock_type, thr);
+ }
+ }
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Does an insert for MySQL. */
+
+int
+row_insert_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, prebuilt->trx, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption((byte*)prebuilt);
+
+ ut_error;
+ }
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ fputs(
+ "InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "inserting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started(trx);
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+/* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW;
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ prebuilt->table->stat_n_rows++;
+
+ srv_n_rows_inserted++;
+
+ if (prebuilt->table->stat_n_rows == 0) {
+ /* Avoid wrap-over */
+ prebuilt->table->stat_n_rows--;
+ }
+
+ row_update_statistics_if_needed(prebuilt->table);
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Builds a dummy query graph used in selects. */
+
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*************************************************************************
+Creates an query graph node of 'update' type to be used in the MySQL
+interface. */
+
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ /* out, own: update node */
+ dict_table_t* table, /* in: table to update */
+ mem_heap_t* heap) /* in: mem heap from which allocated */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(heap);
+
+ node->in_mysql_interface = TRUE;
+ node->is_delete = FALSE;
+ node->searched_update = FALSE;
+ node->select_will_do_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table), heap);
+
+ node->update_n_fields = dict_table_get_n_cols(table);
+
+ UT_LIST_INIT(node->columns);
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ return(node);
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it. */
+
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ /* out: prebuilt update vector */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->upd_node == NULL) {
+
+ /* Not called before for this handle: create an update node
+ and query graph to the prebuilt struct */
+
+ node = row_create_update_node_for_mysql(table, prebuilt->heap);
+
+ prebuilt->upd_node = node;
+
+ prebuilt->upd_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/*************************************************************************
+Does an update or delete of a row for MySQL. */
+
+int
+row_update_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ ulint err;
+ que_thr_t* thr;
+ ibool was_lock_wait;
+ dict_index_t* clust_index;
+/* ulint ref_len; */
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ UT_NOT_USED(mysql_rec);
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, prebuilt->trx, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption((byte*)prebuilt);
+
+ ut_error;
+ }
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ fputs(
+ "InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "updating or deleting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started(trx);
+
+ node = prebuilt->upd_node;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (prebuilt->pcur->btr_cur.index == clust_index) {
+ btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur);
+ } else {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->clust_pcur);
+ }
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ /* MySQL seems to call rnd_pos before updating each row it
+ has cached: we can get the correct cursor position from
+ prebuilt->pcur; NOTE that we cannot build the row reference
+ from mysql_rec if the clustered index was automatically
+ generated for the table: MySQL does not know anything about
+ the row id used as the clustered index key */
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ thr->lock_state= QUE_THR_LOCK_ROW;
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;;
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (node->is_delete) {
+ if (prebuilt->table->stat_n_rows > 0) {
+ prebuilt->table->stat_n_rows--;
+ }
+
+ srv_n_rows_deleted++;
+ } else {
+ srv_n_rows_updated++;
+ }
+
+ row_update_statistics_if_needed(prebuilt->table);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Does an unlock of a row for MySQL. */
+
+int
+row_unlock_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ rec_t* rec;
+ btr_pcur_t* cur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ mtr_t mtr;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "unlock_row";
+
+ if (srv_locks_unsafe_for_binlog) {
+ if (trx->trx_create_lock == TRUE) {
+
+ mtr_start(&mtr);
+
+ /* Restore a cursor position and find a record */
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, cur, &mtr);
+ rec = btr_pcur_get_rec(cur);
+
+ if (rec) {
+
+ lock_rec_reset_and_release_wait(rec);
+ } else {
+ fputs("InnoDB: Error: "
+ "Record for the lock not found\n",
+ stderr);
+ mem_analyze_corruption((byte*) trx);
+ ut_error;
+ }
+
+ trx->trx_create_lock = FALSE;
+ mtr_commit(&mtr);
+ }
+
+ }
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
+
+/**************************************************************************
+Does a cascaded delete or set null in a foreign key operation. */
+
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+ /* out: error code or DB_SUCCESS */
+ que_thr_t* thr, /* in: query thread */
+ upd_node_t* node, /* in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /* in: table where we do the operation */
+{
+ ulint err;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ /* Note that the cascade node is a subnode of another InnoDB
+ query graph node. We do a normal lock wait in this node, but
+ all errors are handled by the parent node. */
+
+ if (err == DB_LOCK_WAIT) {
+ /* Handle lock wait here */
+
+ que_thr_stop_for_mysql(thr);
+
+ srv_suspend_mysql_thread(thr);
+
+ /* Note that a lock wait may also end in a lock wait timeout,
+ or this transaction is picked as a victim in selective
+ deadlock resolution */
+
+ if (trx->error_state != DB_SUCCESS) {
+
+ return(trx->error_state);
+ }
+
+ /* Retry operation after a normal lock wait */
+
+ goto run_again;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (node->is_delete) {
+ if (table->stat_n_rows > 0) {
+ table->stat_n_rows--;
+ }
+
+ srv_n_rows_deleted++;
+ } else {
+ srv_n_rows_updated++;
+ }
+
+ row_update_statistics_if_needed(table);
+
+ return(err);
+}
+
+/*************************************************************************
+Checks if a table is such that we automatically created a clustered
+index on it (on row id). */
+
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ dict_table_t* table)
+{
+ dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (dtype_get_mtype(dict_index_get_nth_type(clust_index, 0))
+ == DATA_SYS) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Calculates the key number used inside MySQL for an Innobase index. We have
+to take into account if we generated a default clustered index for the table */
+
+ulint
+row_get_mysql_key_number_for_index(
+/*===============================*/
+ dict_index_t* index)
+{
+ dict_index_t* ind;
+ ulint i;
+
+ ut_a(index);
+
+ i = 0;
+ ind = dict_table_get_first_index(index->table);
+
+ while (index != ind) {
+ ind = dict_table_get_next_index(ind);
+ i++;
+ }
+
+ if (row_table_got_default_clust_index(index->table)) {
+ ut_a(i > 0);
+ i--;
+ }
+
+ return(i);
+}
+
+/*************************************************************************
+Recovers an orphaned tmp table inside InnoDB by renaming it. In the table
+name #sql becomes rsql, and "_recover_innodb_tmp_table" is catenated to
+the end of name. table->name should be of the form
+"dbname/rsql..._recover_innodb_tmp_table". This renames a table whose
+name is "#sql..." */
+static
+int
+row_mysql_recover_tmp_table(
+/*========================*/
+ /* out: error code or DB_SUCCESS */
+ dict_table_t* table, /* in: table definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ const char* ptr = strstr(table->name, "/rsql");
+
+ if (!ptr) {
+ /* table name does not begin with "/rsql" */
+ trx_commit_for_mysql(trx);
+ return(DB_ERROR);
+ }
+ else {
+ int status;
+ int namelen = (int) strlen(table->name);
+ char* old_name = mem_strdupl(table->name, namelen);
+ /* replace "rsql" with "#sql" */
+ old_name[ptr - table->name + 1] = '#';
+ /* remove "_recover_innodb_tmp_table" suffix */
+ ut_ad(namelen > (int) sizeof S_recover_innodb_tmp_table);
+ ut_ad(!strcmp(old_name + namelen + 1 -
+ sizeof S_recover_innodb_tmp_table,
+ S_recover_innodb_tmp_table));
+ old_name[namelen + 1 - sizeof S_recover_innodb_tmp_table] = 0;
+ status = row_rename_table_for_mysql(old_name,
+ table->name, trx);
+ mem_free(old_name);
+ return(status);
+ }
+}
+
+/*************************************************************************
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+
+void
+row_mysql_freeze_data_dictionary(
+/*=============================*/
+ trx_t* trx) /* in: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ rw_lock_s_lock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*************************************************************************
+Unlocks the data dictionary shared lock. */
+
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx) /* in: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+ rw_lock_s_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
+
+/*************************************************************************
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+
+void
+row_mysql_lock_data_dictionary(
+/*===========================*/
+ trx_t* trx) /* in: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == 0
+ || trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks or lock waits can occur then in these operations */
+
+ rw_lock_x_lock(&dict_operation_lock);
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+
+ mutex_enter(&(dict_sys->mutex));
+}
+
+/*************************************************************************
+Unlocks the data dictionary exclusive lock. */
+
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx) /* in: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ mutex_exit(&(dict_sys->mutex));
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
+
+/*************************************************************************
+Does a table creation operation for MySQL. If the name of the table
+to be created is equal with one of the predefined magic table names,
+then this also starts printing the corresponding monitor output by
+the master thread. */
+
+int
+row_create_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ dict_table_t* table, /* in: table definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ const char* table_name;
+ ulint table_name_len;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ if (srv_created_new_raw) {
+ fputs(
+ "InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ trx_commit_for_mysql(trx);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "creating table";
+
+ if (row_mysql_is_system_table(table->name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL system table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be of the MyISAM type!\n",
+ table->name);
+
+ trx_commit_for_mysql(trx);
+
+ return(DB_ERROR);
+ }
+
+ trx_start_if_not_started(trx);
+
+ if (row_mysql_is_recovered_tmp_table(table->name)) {
+
+ /* MySQL prevents accessing of tables whose name begins
+ with #sql, that is temporary tables. If mysqld crashes in
+ the middle of an ALTER TABLE, we may get an orphaned
+ #sql-table in the tablespace. We have here a special
+ mechanism to recover such tables by renaming them to
+ rsql... */
+
+ return(row_mysql_recover_tmp_table(table, trx));
+ }
+
+ /* The table name is prefixed with the database name and a '/'.
+ Certain table names starting with 'innodb_' have their special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ table_name = strchr(table->name, '/');
+ ut_a(table_name);
+ table_name++;
+ table_name_len = strlen(table_name) + 1;
+
+ if (table_name_len == sizeof S_innodb_monitor
+ && !memcmp(table_name, S_innodb_monitor,
+ sizeof S_innodb_monitor)) {
+
+ /* Table equals "innodb_monitor":
+ start monitor prints */
+
+ srv_print_innodb_monitor = TRUE;
+
+ /* The lock timeout monitor thread also takes care
+ of InnoDB monitor prints */
+
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (table_name_len == sizeof S_innodb_lock_monitor
+ && !memcmp(table_name, S_innodb_lock_monitor,
+ sizeof S_innodb_lock_monitor)) {
+
+ srv_print_innodb_monitor = TRUE;
+ srv_print_innodb_lock_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (table_name_len == sizeof S_innodb_tablespace_monitor
+ && !memcmp(table_name, S_innodb_tablespace_monitor,
+ sizeof S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (table_name_len == sizeof S_innodb_table_monitor
+ && !memcmp(table_name, S_innodb_table_monitor,
+ sizeof S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (table_name_len == sizeof S_innodb_mem_validate
+ && !memcmp(table_name, S_innodb_mem_validate,
+ sizeof S_innodb_mem_validate)) {
+ /* We define here a debugging feature intended for
+ developers */
+
+ fputs("Validating InnoDB memory:\n"
+ "to use this feature you must compile InnoDB with\n"
+ "UNIV_MEM_DEBUG defined in univ.i and the server must be\n"
+ "quiet because allocation from a mem heap is not protected\n"
+ "by any semaphore.\n", stderr);
+#ifdef UNIV_MEM_DEBUG
+ ut_a(mem_validate());
+ fputs("Memory validated\n", stderr);
+#else /* UNIV_MEM_DEBUG */
+ fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
+ stderr);
+#endif /* UNIV_MEM_DEBUG */
+ }
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = TRUE;
+
+ node = tab_create_graph_create(table, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ fputs("InnoDB: Warning: cannot create table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs(" because tablespace full\n", stderr);
+ row_drop_table_for_mysql(table->name, trx, FALSE);
+
+ } else if (err == DB_DUPLICATE_KEY) {
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs(" already exists in InnoDB internal\n"
+ "InnoDB: data dictionary. Have you deleted the .frm file\n"
+ "InnoDB: and not used DROP TABLE? Have you used DROP DATABASE\n"
+ "InnoDB: for InnoDB tables in MySQL version <= 3.23.43?\n"
+ "InnoDB: See the Restrictions section of the InnoDB manual.\n"
+ "InnoDB: You can drop the orphaned table inside InnoDB by\n"
+ "InnoDB: creating an InnoDB table with the same name in another\n"
+ "InnoDB: database and copying the .frm file to the current database.\n"
+ "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n"
+ "InnoDB: succeed.\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+ "InnoDB_troubleshooting_datadict.html\n", stderr);
+ }
+
+ /* We may also get err == DB_ERROR if the .ibd file for the
+ table already exists */
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table. */
+
+int
+row_create_index_for_mysql(
+/*=======================*/
+ /* out: error number or DB_SUCCESS */
+ dict_index_t* index, /* in: index definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+ ulint i, j;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "creating index";
+
+ trx_start_if_not_started(trx);
+
+ /* Check that the same column does not appear twice in the index.
+ Starting from 4.0.14, InnoDB should be able to cope with that, but
+ safer not to allow them. */
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ for (j = 0; j < i; j++) {
+ if (0 == ut_strcmp(
+ dict_index_get_nth_field(index, j)->name,
+ dict_index_get_nth_field(index, i)->name)) {
+
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: column ", stderr);
+ ut_print_name(stderr, trx,
+ dict_index_get_nth_field(index, i)->name);
+ fputs(" appears twice in ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: This is not allowed in InnoDB.\n",
+ stderr);
+
+ err = DB_COL_APPEARS_TWICE_IN_INDEX;
+
+ goto error_handling;
+ }
+ }
+
+ /* Check also that prefix_len < DICT_MAX_COL_PREFIX_LEN */
+
+ if (dict_index_get_nth_field(index, i)->prefix_len
+ >= DICT_MAX_COL_PREFIX_LEN) {
+ err = DB_TOO_BIG_RECORD;
+
+ goto error_handling;
+ }
+ }
+
+ if (row_mysql_is_recovered_tmp_table(index->table_name)) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = TRUE;
+
+ /* Note that the space id where we store the index is inherited from
+ the table in dict_build_index_def_step() in dict0crea.c. */
+
+ node = ind_create_graph_create(index, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+error_handling:
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(index->table_name, trx, FALSE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+bot participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. Check also that foreign key
+constraints which reference this table are ok. */
+
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx, /* in: transaction */
+ const char* sql_string, /* in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ const char* name) /* in: table full name in the
+ normalized form
+ database_name/table_name */
+{
+ ulint err;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_a(sql_string);
+
+ trx->op_info = "adding foreign keys";
+
+ trx_start_if_not_started(trx);
+
+ if (row_mysql_is_recovered_tmp_table(name)) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->dict_operation = TRUE;
+
+ err = dict_create_foreign_constraints(trx, sql_string, name);
+
+ if (err == DB_SUCCESS) {
+ /* Check that also referencing constraints are ok */
+ err = dict_load_foreigns(name, trx->check_foreigns);
+ }
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(name, trx, FALSE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ return((int) err);
+}
+
+/*************************************************************************
+Drops a table for MySQL as a background operation. MySQL relies on Unix
+in ALTER TABLE to the fact that the table handler does not remove the
+table before all handles to it has been removed. Furhermore, the MySQL's
+call to drop table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.c. */
+static
+int
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+ /* out: error code or DB_SUCCESS */
+ const char* name) /* in: table name */
+{
+ ulint error;
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+
+ /* If the original transaction was dropping a table referenced by
+ foreign keys, we must set the following to be able to drop the
+ table: */
+
+ trx->check_foreigns = FALSE;
+
+/* fputs("InnoDB: Error: Dropping table ", stderr);
+ ut_print_name(stderr, name);
+ fputs(" in background drop list\n", stderr); */
+
+ /* Try to drop the table in InnoDB */
+
+ error = row_drop_table_for_mysql(name, trx, FALSE);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ trx_commit_for_mysql(trx);
+
+ trx_free_for_background(trx);
+
+ return((int) error);
+}
+
+/*************************************************************************
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix. */
+
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+ /* out: how many tables dropped
+ + remaining tables in list */
+{
+ row_mysql_drop_t* drop;
+ dict_table_t* table;
+ ulint n_tables;
+ ulint n_tables_dropped = 0;
+loop:
+ mutex_enter(&kernel_mutex);
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&kernel_mutex);
+
+ if (drop == NULL) {
+ /* All tables dropped */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+ table = dict_table_get_low(drop->table_name);
+ mutex_exit(&(dict_sys->mutex));
+
+ if (table == NULL) {
+ /* If for some reason the table has already been dropped
+ through some other mechanism, do not try to drop it */
+
+ goto already_dropped;
+ }
+
+ if (DB_SUCCESS != row_drop_table_for_mysql_in_background(
+ drop->table_name)) {
+ /* If the DROP fails for some table, we return, and let the
+ main thread retry later */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ n_tables_dropped++;
+
+already_dropped:
+ mutex_enter(&kernel_mutex);
+
+ UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Dropped table %s in background drop queue.\n",
+ drop->table_name);
+
+ mem_free(drop->table_name);
+
+ mem_free(drop);
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+}
+
+/*************************************************************************
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex! */
+
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+ /* out: how many tables in list */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ return(UT_LIST_GET_LEN(row_mysql_drop_list));
+}
+
+/*************************************************************************
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily. */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ /* out: TRUE if the table was not yet in the
+ drop list, and was added there */
+ dict_table_t* table) /* in: table */
+{
+ row_mysql_drop_t* drop;
+
+ mutex_enter(&kernel_mutex);
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ /* Look if the table already is in the drop list */
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ while (drop != NULL) {
+ if (strcmp(drop->table_name, table->name) == 0) {
+ /* Already in the list */
+
+ mutex_exit(&kernel_mutex);
+
+ return(FALSE);
+ }
+
+ drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop);
+ }
+
+ drop = mem_alloc(sizeof(row_mysql_drop_t));
+
+ drop->table_name = mem_strdup(table->name);
+
+ UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+/* fputs("InnoDB: Adding table ", stderr);
+ ut_print_name(stderr, drop->table_name);
+ fputs(" to background drop list\n", stderr); */
+
+ mutex_exit(&kernel_mutex);
+
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************************
+Discards the tablespace of a table which stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set TRUE. */
+
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ /* out: error code or DB_SUCCESS */
+ const char* name, /* in: table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_foreign_t* foreign;
+ dulint new_id;
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph = NULL;
+ ibool success;
+ ulint err;
+ char* buf;
+
+/* How do we prevent crashes caused by ongoing operations on the table? Old
+operations could try to access non-existent pages.
+
+1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock
+on the table before we can do DISCARD TABLESPACE. Then there are no running
+queries on the table.
+2) Purge and rollback: we assign a new table id for the table. Since purge and
+rollback look for the table based on the table id, they see the table as
+'dropped' and discard their operations.
+3) Insert buffer: we remove all entries for the tablespace in the insert
+buffer tree; as long as the tablespace mem object does not exist, ongoing
+insert buffer page merges are discarded in buf0rea.c. If we recreate the
+tablespace mem object with IMPORT TABLESPACE later, then the tablespace will
+have the same id, but the tablespace_version field in the mem object is
+different, and ongoing old insert buffer page merges get discarded.
+4) Linear readahead and random readahead: we use the same method as in 3) to
+discard ongoing operations.
+5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we
+do not allow the discard. We also reserve the data dictionary latch. */
+
+ static const char discard_tablespace_proc1[] =
+ "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
+ "old_id CHAR;\n"
+ "new_id CHAR;\n"
+ "new_id_low INT;\n"
+ "new_id_high INT;\n"
+ "table_name CHAR;\n"
+ "BEGIN\n"
+ "table_name := '";
+ static const char discard_tablespace_proc2[] =
+ "';\n"
+ "new_id_high := %lu;\n"
+ "new_id_low := %lu;\n"
+ "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n"
+ "SELECT ID INTO old_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = table_name;\n"
+ "IF (SQL %% NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "UPDATE SYS_TABLES SET ID = new_id\n"
+ "WHERE ID = old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n"
+ "WHERE TABLE_ID = old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n"
+ "WHERE TABLE_ID = old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "discarding tablespace";
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs("\n"
+"InnoDB: is in the system tablespace 0 which cannot be discarded\n", stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: You are trying to DISCARD table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs("\n"
+ "InnoDB: though there is a foreign key check running on it.\n"
+ "InnoDB: Cannot discard the table.\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns) {
+
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow discarding a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot DISCARD table ", ef);
+ ut_print_name(ef, trx, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+
+ new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+ buf = mem_alloc((sizeof discard_tablespace_proc1) +
+ (sizeof discard_tablespace_proc2) +
+ 20 + ut_strlenq(name, '\''));
+
+ memcpy(buf, discard_tablespace_proc1, sizeof discard_tablespace_proc1);
+ sprintf(ut_strcpyq(buf + (sizeof discard_tablespace_proc1 - 1),
+ '\'', name),
+ discard_tablespace_proc2,
+ (ulong) ut_dulint_get_high(new_id),
+ (ulong) ut_dulint_get_low(new_id));
+
+ graph = pars_sql(buf);
+
+ ut_a(graph);
+
+ /* Remove any locks there are on the table or its records */
+
+ lock_reset_all_on_table(table);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ dict_table_change_id_in_cache(table, new_id);
+
+ success = fil_discard_tablespace(table->space);
+
+ if (!success) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+
+ err = DB_ERROR;
+ } else {
+ /* Set the flag which tells that now it is legal to
+ IMPORT a tablespace for this table */
+ table->tablespace_discarded = TRUE;
+ table->ibd_file_missing = TRUE;
+ }
+ }
+funct_exit:
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (graph) {
+ que_graph_free(graph);
+ }
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary. */
+
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ /* out: error code or DB_SUCCESS */
+ const char* name, /* in: table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ ibool success;
+ dulint current_lsn;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx_start_if_not_started(trx);
+
+ trx->op_info = "importing tablespace";
+
+ current_lsn = log_get_lsn();
+
+ /* It is possible, though very improbable, that the lsn's in the
+ tablespace to be imported have risen above the current system lsn, if
+ a lengthy purge, ibuf merge, or rollback was performed on a backup
+ taken with ibbackup. If that is the case, reset page lsn's in the
+ file. We assume that mysqld was shut down after it performed these
+ cleanup operations on the .ibd file, so that it stamped the latest lsn
+ to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+
+ TODO: reset also the trx id's in clustered index records and write
+ a new space id to each data page. That would allow us to import clean
+ .ibd files from another MySQL installation. */
+
+ success = fil_reset_too_high_lsns(name, current_lsn);
+
+ if (!success) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs("\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", stderr);
+
+ err = DB_ERROR;
+
+ row_mysql_lock_data_dictionary(trx);
+
+ goto funct_exit;
+ }
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: table ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs("\n"
+"InnoDB: does not exist in the InnoDB data dictionary\n"
+"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs("\n"
+"InnoDB: is in the system tablespace 0 which cannot be imported\n", stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (!table->tablespace_discarded) {
+ ut_print_timestamp(stderr);
+ fputs(
+" InnoDB: Error: you are trying to IMPORT a tablespace\n"
+"InnoDB: ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs(", though you have not called DISCARD on it yet\n"
+"InnoDB: during the lifetime of the mysqld process!\n", stderr);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Play safe and remove all insert buffer entries, though we should
+ have removed them already when DISCARD TABLESPACE was called */
+
+ ibuf_delete_for_discarded_space(table->space);
+
+ success = fil_open_single_table_tablespace(TRUE, table->space,
+ table->name);
+ if (success) {
+ table->ibd_file_missing = FALSE;
+ table->tablespace_discarded = FALSE;
+ } else {
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fputs(
+" InnoDB: cannot find or open in the database directory the .ibd file of\n"
+"InnoDB: table ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs("\n"
+"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+ }
+
+ err = DB_ERROR;
+ }
+
+funct_exit:
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Truncates a table for MySQL. */
+
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+ /* out: error code or DB_SUCCESS */
+ dict_table_t* table, /* in: table handle */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_foreign_t* foreign;
+ ulint err;
+ mem_heap_t* heap;
+ byte* buf;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ dulint new_id;
+ char* sql;
+ que_thr_t* thr;
+ que_t* graph = NULL;
+
+/* How do we prevent crashes caused by ongoing operations on the table? Old
+operations could try to access non-existent pages.
+
+1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock
+on the table before we can do TRUNCATE TABLE. Then there are no running
+queries on the table. This is guaranteed, because in
+ha_innobase::store_lock(), we do not weaken the TL_WRITE lock requested
+by MySQL when executing SQLCOM_TRUNCATE.
+2) Purge and rollback: we assign a new table id for the table. Since purge and
+rollback look for the table based on the table id, they see the table as
+'dropped' and discard their operations.
+3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, so we do not
+have to remove insert buffer records, as the insert buffer works at a low
+level. If a freed page is later reallocated, the allocator will remove
+the ibuf entries for it.
+
+TODO: when we truncate *.ibd files (analogous to DISCARD TABLESPACE), we
+will have to remove we remove all entries for the table in the insert
+buffer tree!
+
+4) Linear readahead and random readahead: we use the same method as in 3) to
+discard ongoing operations. (This will only be relevant for TRUNCATE TABLE
+by DISCARD TABLESPACE.)
+5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we
+do not allow the TRUNCATE. We also reserve the data dictionary latch. */
+
+ static const char renumber_tablespace_proc[] =
+ "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+ "old_id CHAR;\n"
+ "new_id CHAR;\n"
+ "old_id_low INT;\n"
+ "old_id_high INT;\n"
+ "new_id_low INT;\n"
+ "new_id_high INT;\n"
+ "BEGIN\n"
+ "old_id_high := %lu;\n"
+ "old_id_low := %lu;\n"
+ "new_id_high := %lu;\n"
+ "new_id_low := %lu;\n"
+ "old_id := CONCAT(TO_BINARY(old_id_high, 4), TO_BINARY(old_id_low, 4));\n"
+ "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n"
+ "UPDATE SYS_TABLES SET ID = new_id\n"
+ "WHERE ID = old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n"
+ "WHERE TABLE_ID = old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n"
+ "WHERE TABLE_ID = old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(table);
+
+ if (srv_created_new_raw) {
+ fputs(
+ "InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "truncating table";
+
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+ /* Prevent foreign key checks etc. while we are truncating the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns) {
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow truncating a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot truncate table ", ef);
+ ut_print_name(ef, trx, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because it is referenced by ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+ they can cope with the table having been truncated here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Cannot truncate table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs(" by DROP+CREATE\n"
+"InnoDB: because there is a foreign key check running on it.\n",
+ stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Remove any locks there are on the table or its records */
+
+ lock_reset_all_on_table(table);
+
+ trx->table_id = table->id;
+
+ /* scan SYS_INDEXES for all indexes of the table */
+ heap = mem_heap_create(800);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ mtr_start(&mtr);
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF, &pcur, &mtr);
+ for (;;) {
+ rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint root_page_no;
+
+ if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) {
+ /* The end of SYS_INDEXES has been reached. */
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+
+ if (memcmp(buf, field, len) != 0) {
+ /* End of indexes for the table (TABLE_ID mismatch). */
+ break;
+ }
+
+ if (rec_get_deleted_flag(rec, FALSE)) {
+ /* The index has been dropped. */
+ goto next_rec;
+ }
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ /* This call may commit and restart mtr. */
+ root_page_no = dict_truncate_index_tree(table, rec, &mtr);
+
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (root_page_no != FIL_NULL) {
+ page_rec_write_index_page_no(rec,
+ DICT_SYS_INDEXES_PAGE_NO_FIELD,
+ root_page_no, &mtr);
+ /* We will need to commit and restart the
+ mini-transaction in order to avoid deadlocks.
+ The dict_truncate_index_tree() call has allocated
+ a page in this mini-transaction, and the rest of
+ this loop could latch another index page. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+ }
+
+ next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+ mem_heap_empty(heap);
+ sql = mem_heap_alloc(heap, (sizeof renumber_tablespace_proc) + 40);
+ sprintf(sql, renumber_tablespace_proc,
+ (ulong) ut_dulint_get_high(table->id),
+ (ulong) ut_dulint_get_low(table->id),
+ (ulong) ut_dulint_get_high(new_id),
+ (ulong) ut_dulint_get_low(new_id));
+
+ graph = pars_sql(sql);
+
+ ut_a(graph);
+
+ mem_heap_free(heap);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ thr = que_fork_start_command(graph);
+ ut_a(thr);
+
+ que_run_threads(thr);
+
+ que_graph_free(graph);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ ut_print_timestamp(stderr);
+fputs(" InnoDB: Unable to assign a new identifier to table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs("\n"
+"InnoDB: after truncating it. Background processes may corrupt the table!\n",
+ stderr);
+ err = DB_ERROR;
+ } else {
+ dict_table_change_id_in_cache(table, new_id);
+ }
+
+ dict_table_autoinc_initialize(table, 0);
+ dict_update_statistics(table);
+
+ trx_commit_for_mysql(trx);
+
+funct_exit:
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ return((int) err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************************
+Drops a table for MySQL. If the name of the table to be dropped is equal
+with one of the predefined magic table names, then this also stops printing
+the corresponding monitor output by the master thread. */
+
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ /* out: error code or DB_SUCCESS */
+ const char* name, /* in: table name */
+ trx_t* trx, /* in: transaction handle */
+ ibool drop_db)/* in: TRUE=dropping whole database */
+{
+ dict_foreign_t* foreign;
+ dict_table_t* table;
+ ulint space_id;
+ que_thr_t* thr;
+ que_t* graph;
+ ulint err;
+ const char* table_name;
+ ulint namelen;
+ char* dir_path_of_temp_table = NULL;
+ ibool success;
+ ibool locked_dictionary = FALSE;
+ char* quoted_name;
+ char* sql;
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ static const char str1[] =
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "table_name CHAR;\n"
+ "sys_foreign_id CHAR;\n"
+ "table_id CHAR;\n"
+ "index_id CHAR;\n"
+ "foreign_id CHAR;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "table_name := ";
+ static const char str2[] =
+ ";\n"
+ "SELECT ID INTO table_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = table_name;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "found := 1;\n"
+ "SELECT ID INTO sys_foreign_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = 'SYS_FOREIGN';\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (table_name = 'SYS_FOREIGN') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (table_name = 'SYS_FOREIGN_COLS') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = table_name\n"
+ " AND TO_BINARY(FOR_NAME) = TO_BINARY(table_name);\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE"
+ " DELETE FROM SYS_FOREIGN_COLS WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO index_id\n"
+ " FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID = table_id;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_INDEXES WHERE ID = index_id\n"
+ " AND TABLE_ID = table_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n"
+ "DELETE FROM SYS_TABLES WHERE ID = table_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ ut_a(name != NULL);
+
+ if (srv_created_new_raw) {
+ fputs(
+ "InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "dropping table";
+
+ trx_start_if_not_started(trx);
+
+ /* The table name is prefixed with the database name and a '/'.
+ Certain table names starting with 'innodb_' have their special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ table_name = strchr(name, '/');
+ ut_a(table_name);
+ table_name++;
+ namelen = strlen(table_name) + 1;
+
+ if (namelen == sizeof S_innodb_monitor
+ && !memcmp(table_name, S_innodb_monitor,
+ sizeof S_innodb_monitor)) {
+
+ /* Table name equals "innodb_monitor":
+ stop monitor prints */
+
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_lock_monitor
+ && !memcmp(table_name, S_innodb_lock_monitor,
+ sizeof S_innodb_lock_monitor)) {
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_tablespace_monitor
+ && !memcmp(table_name, S_innodb_tablespace_monitor,
+ sizeof S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_table_monitor
+ && !memcmp(table_name, S_innodb_table_monitor,
+ sizeof S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = FALSE;
+ }
+
+ quoted_name = mem_strdupq(name, '\'');
+ namelen = strlen(quoted_name);
+ sql = mem_alloc((sizeof str1) + (sizeof str2) - 2 + 1 + namelen);
+ memcpy(sql, str1, (sizeof str1) - 1);
+ memcpy(sql + (sizeof str1) - 1, quoted_name, namelen);
+ memcpy(sql + (sizeof str1) - 1 + namelen, str2, sizeof str2);
+ mem_free(quoted_name);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+ /* Prevent foreign key checks etc. while we are dropping the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ locked_dictionary = TRUE;
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ graph = pars_sql(sql);
+
+ ut_a(graph);
+ mem_free(sql);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is trying to drop it.\n"
+ "InnoDB: Have you copied the .frm file of the table to the\n"
+ "InnoDB: MySQL database directory from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+ "InnoDB_troubleshooting_datadict.html\n", stderr);
+ goto funct_exit;
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns &&
+ !(drop_db && dict_tables_have_same_db(
+ name, foreign->foreign_table_name))) {
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow dropping a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot drop table ", ef);
+ ut_print_name(ef, trx, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+
+ if (table->n_mysql_handles_opened > 0) {
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+fputs(" InnoDB: Warning: MySQL is trying to drop table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs("\n"
+"InnoDB: though there are still open handles to it.\n"
+"InnoDB: Adding the table to the background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+ they can cope with the table having been dropped here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+fputs(" InnoDB: You are trying to drop table ", stderr);
+ ut_print_name(stderr, trx, table->name);
+ fputs("\n"
+"InnoDB: though there is a foreign key check running on it.\n"
+"InnoDB: Adding the table to the background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+ /* Remove any locks there are on the table or its records */
+
+ lock_reset_all_on_table(table);
+
+ trx->dict_operation = TRUE;
+ trx->table_id = table->id;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ ut_error;
+ } else {
+ ibool is_path;
+ const char* name_or_path;
+
+ space_id = table->space;
+
+ if (table->dir_path_of_temp_table != NULL) {
+ dir_path_of_temp_table =
+ mem_strdup(table->dir_path_of_temp_table);
+ is_path = TRUE;
+ name_or_path = dir_path_of_temp_table;
+ } else {
+ is_path = FALSE;
+ name_or_path = name;
+ }
+
+ dict_table_remove_from_cache(table);
+
+ if (dict_load_table(name) != NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: not able to remove table ",
+ stderr);
+ ut_print_name(stderr, trx, name);
+ fputs(" from the dictionary cache!\n", stderr);
+ err = DB_ERROR;
+ }
+
+ /* Do not drop possible .ibd tablespace if something went
+ wrong: we do not want to delete valuable data of the user */
+
+ if (err == DB_SUCCESS && space_id > 0) {
+ if (!fil_space_for_table_exists_in_mem(space_id,
+ name_or_path,
+ is_path,
+ FALSE, TRUE)) {
+ err = DB_SUCCESS;
+
+ fprintf(stderr,
+"InnoDB: We removed now the InnoDB internal data dictionary entry\n"
+"InnoDB: of table ");
+ ut_print_name(stderr, trx, name);
+ fprintf(stderr, ".\n");
+
+ goto funct_exit;
+ }
+
+ success = fil_delete_tablespace(space_id);
+
+ if (!success) {
+ fprintf(stderr,
+"InnoDB: We removed now the InnoDB internal data dictionary entry\n"
+"InnoDB: of table ");
+ ut_print_name(stderr, trx, name);
+ fprintf(stderr, ".\n");
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: not able to delete tablespace %lu of table ",
+ (ulong) space_id);
+ ut_print_name(stderr, trx, name);
+ fputs("!\n", stderr);
+ err = DB_ERROR;
+ }
+ }
+ }
+funct_exit:
+
+ if (locked_dictionary) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ if (dir_path_of_temp_table) {
+ mem_free(dir_path_of_temp_table);
+ }
+
+ que_graph_free(graph);
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = "";
+
+#ifndef UNIV_HOTBACKUP
+ srv_wake_master_thread();
+#endif /* !UNIV_HOTBACKUP */
+
+ return((int) err);
+}
+
+/*************************************************************************
+Drops a database for MySQL. */
+
+int
+row_drop_database_for_mysql(
+/*========================*/
+ /* out: error code or DB_SUCCESS */
+ const char* name, /* in: database name which ends to '/' */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ char* table_name;
+ int err = DB_SUCCESS;
+ ulint namelen = strlen(name);
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(name != NULL);
+ ut_a(name[namelen - 1] == '/');
+
+ trx->op_info = "dropping database";
+
+ trx_start_if_not_started(trx);
+loop:
+ row_mysql_lock_data_dictionary(trx);
+
+ while ((table_name = dict_get_first_table_name_in_db(name))) {
+ ut_a(memcmp(table_name, name, namelen) == 0);
+
+ table = dict_table_get_low(table_name);
+
+ ut_a(table);
+
+ /* Wait until MySQL does not have any queries running on
+ the table */
+
+ if (table->n_mysql_handles_opened > 0) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ut_print_timestamp(stderr);
+ fputs(
+ " InnoDB: Warning: MySQL is trying to drop database ", stderr);
+ ut_print_name(stderr, trx, name);
+ fputs("\n"
+ "InnoDB: though there are still open handles to table ", stderr);
+ ut_print_name(stderr, trx, table_name);
+ fputs(".\n", stderr);
+
+ os_thread_sleep(1000000);
+
+ mem_free(table_name);
+
+ goto loop;
+ }
+
+ err = row_drop_table_for_mysql(table_name, trx, TRUE);
+
+ mem_free(table_name);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, name);
+ fprintf(stderr, " failed with error %lu for table ",
+ (ulint) err);
+ ut_print_name(stderr, trx, table_name);
+ putc('\n', stderr);
+ break;
+ }
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*************************************************************************
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL. */
+static
+ibool
+row_is_mysql_tmp_table_name(
+/*========================*/
+ /* out: TRUE if temporary table */
+ const char* name) /* in: table name in the form
+ 'database/tablename' */
+{
+ return(strstr(name, "/#sql") != NULL);
+}
+
+/*************************************************************************
+Renames a table for MySQL. */
+
+int
+row_rename_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ const char* old_name, /* in: old table name */
+ const char* new_name, /* in: new table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph = NULL;
+ ulint err;
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+ static const char str1[] =
+ "PROCEDURE RENAME_TABLE_PROC () IS\n"
+ "new_table_name CHAR;\n"
+ "old_table_name CHAR;\n"
+ "gen_constr_prefix CHAR;\n"
+ "new_db_name CHAR;\n"
+ "foreign_id CHAR;\n"
+ "new_foreign_id CHAR;\n"
+ "old_db_name_len INT;\n"
+ "old_t_name_len INT;\n"
+ "new_db_name_len INT;\n"
+ "id_len INT;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "new_table_name := '";
+ static const char str2[] =
+ "';\nold_table_name := '";
+ static const char str3[] =
+ "';\n"
+ "UPDATE SYS_TABLES SET NAME = new_table_name\n"
+ "WHERE NAME = old_table_name;\n";
+ static const char str4a1[] = /* drop some constraints of tmp tables */
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID = '";
+ static const char str4a2[] = "';\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID = '";
+ static const char str4a3[] = "';\n";
+ static const char str4b[] = /* rename all constraints */
+ "found := 1;\n"
+ "old_db_name_len := INSTR(old_table_name, '/') - 1;\n"
+ "new_db_name_len := INSTR(new_table_name, '/') - 1;\n"
+ "new_db_name := SUBSTR(new_table_name, 0, new_db_name_len);\n"
+ "old_t_name_len := LENGTH(old_table_name);\n"
+ "gen_constr_prefix := CONCAT(old_table_name, '_ibfk_');\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = old_table_name\n"
+ " AND TO_BINARY(FOR_NAME) = TO_BINARY(old_table_name);\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET FOR_NAME = new_table_name\n"
+ " WHERE ID = foreign_id;\n"
+ " id_len := LENGTH(foreign_id);\n"
+ " IF (INSTR(foreign_id, '/') > 0) THEN\n"
+ " IF (INSTR(foreign_id,\n"
+ " gen_constr_prefix) > 0)\n"
+ " THEN\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_table_name,\n"
+ " SUBSTR(foreign_id, old_t_name_len,\n"
+ " id_len - old_t_name_len));\n"
+ " ELSE\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_db_name,\n"
+ " SUBSTR(foreign_id,\n"
+ " old_db_name_len,\n"
+ " id_len - old_db_name_len));\n"
+ " END IF;\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " UPDATE SYS_FOREIGN_COLS\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_FOREIGN SET REF_NAME = new_table_name\n"
+ "WHERE REF_NAME = old_table_name\n"
+ " AND TO_BINARY(REF_NAME) = TO_BINARY(old_table_name);\n";
+ static const char str5[] =
+ "END;\n";
+
+ mem_heap_t* heap = NULL;
+ const char** constraints_to_drop = NULL;
+ ulint n_constraints_to_drop = 0;
+ ibool recovering_temp_table = FALSE;
+ ulint len;
+ ulint i;
+ ibool success;
+ /* length of database name; 0 if not renaming to a temporary table */
+ ulint db_name_len;
+ char* sql;
+ char* sqlend;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ fputs(
+ "InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ trx_commit_for_mysql(trx);
+ return(DB_ERROR);
+ }
+
+ if (row_mysql_is_system_table(new_name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL system table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be of the MyISAM type!\n",
+ new_name);
+
+ trx_commit_for_mysql(trx);
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "renaming table";
+ trx_start_if_not_started(trx);
+
+ if (row_mysql_is_recovered_tmp_table(new_name)) {
+
+ recovering_temp_table = TRUE;
+ } else {
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ table = dict_table_get_low(old_name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, old_name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is trying to rename the table.\n"
+ "InnoDB: Have you copied the .frm file of the table to the\n"
+ "InnoDB: MySQL database directory from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+ "InnoDB_troubleshooting_datadict.html\n", stderr);
+ goto funct_exit;
+ }
+
+ if (table->ibd_file_missing) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, old_name);
+ fputs(
+ " does not have an .ibd file in the database directory.\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+ "InnoDB_troubleshooting_datadict.html\n", stderr);
+ goto funct_exit;
+ }
+
+ /* calculate the length of the SQL string */
+ len = (sizeof str1) + (sizeof str2) + (sizeof str3) + (sizeof str5) - 4
+ + ut_strlenq(new_name, '\'') + ut_strlenq(old_name, '\'');
+
+ if (row_is_mysql_tmp_table_name(new_name)) {
+ db_name_len = dict_get_db_name_len(old_name) + 1;
+
+ /* MySQL is doing an ALTER TABLE command and it renames the
+ original table to a temporary table name. We want to preserve
+ the original foreign key constraint definitions despite the
+ name change. An exception is those constraints for which
+ the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+ heap = mem_heap_create(100);
+
+ err = dict_foreign_parse_drop_constraints(heap, trx,
+ table,
+ &n_constraints_to_drop,
+ &constraints_to_drop);
+ if (err != DB_SUCCESS) {
+
+ goto funct_exit;
+ }
+
+ /* reserve space for all database names */
+ len += 2 * n_constraints_to_drop
+ * (ut_strlenq(old_name, '\'')
+ - ut_strlenq(old_name + db_name_len, '\''));
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ ulint addlen
+ = 2 * ut_strlenq(constraints_to_drop[i], '\'')
+ + ((sizeof str4a1) + (sizeof str4a2)
+ + (sizeof str4a3) - 3);
+ if (!strchr(constraints_to_drop[i], '/')) {
+ addlen *= 2;
+ }
+ len += addlen;
+ }
+ } else {
+ db_name_len = 0;
+ len += (sizeof str4b) - 1;
+ }
+
+ sql = sqlend = mem_alloc(len + 1);
+ memcpy(sql, str1, (sizeof str1) - 1);
+ sqlend += (sizeof str1) - 1;
+ sqlend = ut_strcpyq(sqlend, '\'', new_name);
+ memcpy(sqlend, str2, (sizeof str2) - 1);
+ sqlend += (sizeof str2) - 1;
+ sqlend = ut_strcpyq(sqlend, '\'', old_name);
+ memcpy(sqlend, str3, (sizeof str3) - 1);
+ sqlend += (sizeof str3) - 1;
+
+ if (db_name_len) {
+ /* Internally, old format < 4.0.18 constraints have as the
+ constraint id <number>_<number>, while new format constraints
+ have <databasename>/<constraintname>. */
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ memcpy(sqlend, str4a1, (sizeof str4a1) - 1);
+ sqlend += (sizeof str4a1) - 1;
+ sqlend = ut_memcpyq(sqlend, '\'',
+ old_name, db_name_len);
+ sqlend = ut_strcpyq(sqlend, '\'',
+ constraints_to_drop[i]);
+ memcpy(sqlend, str4a2, (sizeof str4a2) - 1);
+ sqlend += (sizeof str4a2) - 1;
+ sqlend = ut_memcpyq(sqlend, '\'',
+ old_name, db_name_len);
+ sqlend = ut_strcpyq(sqlend, '\'',
+ constraints_to_drop[i]);
+ memcpy(sqlend, str4a3, (sizeof str4a3) - 1);
+ sqlend += (sizeof str4a3) - 1;
+
+ if (!strchr(constraints_to_drop[i], '/')) {
+ /* If this happens to be an old format
+ constraint, let us delete it. Since all new
+ format constraints contain '/', it does no
+ harm to run these DELETEs anyway. */
+
+ memcpy(sqlend, str4a1, (sizeof str4a1) - 1);
+ sqlend += (sizeof str4a1) - 1;
+ sqlend = ut_strcpyq(sqlend, '\'',
+ constraints_to_drop[i]);
+ memcpy(sqlend, str4a2, (sizeof str4a2) - 1);
+ sqlend += (sizeof str4a2) - 1;
+ sqlend = ut_strcpyq(sqlend, '\'',
+ constraints_to_drop[i]);
+ memcpy(sqlend, str4a3, (sizeof str4a3) - 1);
+ sqlend += (sizeof str4a3) - 1;
+ }
+ }
+ }
+ else {
+ memcpy(sqlend, str4b, (sizeof str4b) - 1);
+ sqlend += (sizeof str4b) - 1;
+ }
+
+ memcpy(sqlend, str5, sizeof str5);
+ sqlend += sizeof str5;
+
+ ut_a(sqlend == sql + len + 1);
+
+ graph = pars_sql(sql);
+
+ ut_a(graph);
+ mem_free(sql);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DUPLICATE_KEY) {
+ ut_print_timestamp(stderr);
+ fputs(
+ " InnoDB: Error; possible reasons:\n"
+ "InnoDB: 1) Table rename would cause two FOREIGN KEY constraints\n"
+ "InnoDB: to have the same internal name in case-insensitive comparison.\n"
+ "InnoDB: 2) table ", stderr);
+ ut_print_name(stderr, trx, new_name);
+ fputs(" exists in the InnoDB internal data\n"
+ "InnoDB: dictionary though MySQL is trying rename table ", stderr);
+ ut_print_name(stderr, trx, old_name);
+ fputs(" to it.\n"
+ "InnoDB: Have you deleted the .frm file and not used DROP TABLE?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: http://dev.mysql.com/doc/mysql/en/"
+ "InnoDB_troubleshooting_datadict.html\n"
+ "InnoDB: If table ", stderr);
+ ut_print_name(stderr, trx, new_name);
+ fputs(
+ " is a temporary table #sql..., then it can be that\n"
+ "InnoDB: there are still queries running on the table, and it will be\n"
+ "InnoDB: dropped automatically when the queries end.\n"
+ "InnoDB: You can drop the orphaned table inside InnoDB by\n"
+ "InnoDB: creating an InnoDB table with the same name in another\n"
+ "InnoDB: database and copying the .frm file to the current database.\n"
+ "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n"
+ "InnoDB: succeed.\n", stderr);
+ }
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ /* The following call will also rename the .ibd data file if
+ the table is stored in a single-table tablespace */
+
+ success = dict_table_rename_in_cache(table, new_name,
+ !row_is_mysql_tmp_table_name(new_name));
+ if (!success) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error in table rename, cannot rename ",
+ stderr);
+ ut_print_name(stderr, trx, old_name);
+ fputs(" to ", stderr);
+ ut_print_name(stderr, trx, new_name);
+ putc('\n', stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ err = dict_load_foreigns(new_name, trx->check_foreigns);
+
+ if (row_is_mysql_tmp_table_name(old_name)) {
+
+ /* MySQL is doing an ALTER TABLE command and it
+ renames the created temporary table to the name
+ of the original table. In the ALTER TABLE we maybe
+ created some FOREIGN KEY constraints for the temporary
+ table. But we want to load also the foreign key
+ constraint definitions for the original table name. */
+
+ if (err != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: in ALTER TABLE ",
+ stderr);
+ ut_print_name(stderr, trx, new_name);
+ fputs("\n"
+ "InnoDB: has or is referenced in foreign key constraints\n"
+ "InnoDB: which are not compatible with the new table definition.\n",
+ stderr);
+
+ ut_a(dict_table_rename_in_cache(table,
+ old_name, FALSE));
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE,
+ NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+ } else {
+ if (err != DB_SUCCESS) {
+
+ ut_print_timestamp(stderr);
+
+ fputs(
+ " InnoDB: Error: in RENAME TABLE table ",
+ stderr);
+ ut_print_name(stderr, trx, new_name);
+ fputs("\n"
+ "InnoDB: is referenced in foreign key constraints\n"
+ "InnoDB: which are not compatible with the new table definition.\n",
+ stderr);
+
+ ut_a(dict_table_rename_in_cache(table,
+ old_name, FALSE));
+
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE,
+ NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+ }
+ }
+funct_exit:
+ if (!recovering_temp_table) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ if (graph) {
+ que_graph_free(graph);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*************************************************************************
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction. */
+static
+ibool
+row_scan_and_check_index(
+/*=====================*/
+ /* out: TRUE if ok */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL */
+ dict_index_t* index, /* in: index */
+ ulint* n_rows) /* out: number of entries seen in the
+ current consistent read */
+{
+ dtuple_t* prev_entry = NULL;
+ ulint matched_fields;
+ ulint matched_bytes;
+ byte* buf;
+ ulint ret;
+ rec_t* rec;
+ ibool is_ok = TRUE;
+ int cmp;
+ ibool contains_null;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ *n_rows = 0;
+
+ buf = mem_alloc(UNIV_PAGE_SIZE);
+ heap = mem_heap_create(100);
+
+ /* Make a dummy template in prebuilt, which we will use
+ in scanning the index entries */
+
+ prebuilt->index = index;
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ prebuilt->n_template = 0;
+ prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+loop:
+ if (ret != DB_SUCCESS) {
+
+ mem_free(buf);
+ mem_heap_free(heap);
+
+ return(is_ok);
+ }
+
+ *n_rows = *n_rows + 1;
+
+ /* row_search... returns the index record in buf, record origin offset
+ within buf stored in the first 4 bytes, because we have built a dummy
+ template */
+
+ rec = buf + mach_read_from_4(buf);
+
+ if (prev_entry != NULL) {
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+ &matched_fields,
+ &matched_bytes);
+ contains_null = FALSE;
+
+ /* In a unique secondary index we allow equal key values if
+ they contain SQL NULLs */
+
+ for (i = 0;
+ i < dict_index_get_n_ordering_defined_by_user(index);
+ i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(prev_entry, i))) {
+
+ contains_null = TRUE;
+ }
+ }
+
+ if (cmp > 0) {
+ fputs("InnoDB: index records in a wrong order in ",
+ stderr);
+ not_ok:
+ dict_index_name_print(stderr,
+ prebuilt->trx, index);
+ fputs("\n"
+ "InnoDB: prev record ", stderr);
+ dtuple_print(stderr, prev_entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ is_ok = FALSE;
+ } else if ((index->type & DICT_UNIQUE)
+ && !contains_null
+ && matched_fields >=
+ dict_index_get_n_ordering_defined_by_user(index)) {
+
+ fputs("InnoDB: duplicate key in ", stderr);
+ goto not_ok;
+ }
+ }
+
+ mem_heap_empty(heap);
+ offsets = offsets_;
+
+ prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
+
+ goto loop;
+}
+
+/*************************************************************************
+Checks a table for corruption. */
+
+ulint
+row_check_table_for_mysql(
+/*======================*/
+ /* out: DB_ERROR or DB_SUCCESS */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ ulint ret = DB_SUCCESS;
+ ulint old_isolation_level;
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ prebuilt->trx->op_info = "checking table";
+
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+ of records in some index; to play safe, we use always
+ REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ /* fputs("Validating index ", stderr);
+ ut_print_name(stderr, index->name);
+ putc('\n', stderr); */
+
+ if (!btr_validate_tree(index->tree)) {
+ ret = DB_ERROR;
+ } else {
+ if (!row_scan_and_check_index(prebuilt,
+ index, &n_rows)) {
+ ret = DB_ERROR;
+ }
+
+ /* fprintf(stderr, "%lu entries in index %s\n", n_rows,
+ index->name); */
+
+ if (index == dict_table_get_first_index(table)) {
+ n_rows_in_table = n_rows;
+ } else if (n_rows != n_rows_in_table) {
+
+ ret = DB_ERROR;
+
+ fputs("Error: ", stderr);
+ dict_index_name_print(stderr,
+ prebuilt->trx, index);
+ fprintf(stderr,
+ " contains %lu entries, should be %lu\n",
+ (ulong) n_rows,
+ (ulong) n_rows_in_table);
+ }
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
+ /* We validate also the whole adaptive hash index for all tables
+ at every CHECK TABLE */
+
+ if (!btr_search_validate()) {
+
+ ret = DB_ERROR;
+ }
+
+ /* Restore the fatal lock wait timeout after CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ prebuilt->trx->op_info = "";
+
+ return(ret);
+}
diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.c
new file mode 100644
index 00000000000..5893e016011
--- /dev/null
+++ b/storage/innobase/row/row0purge.c
@@ -0,0 +1,671 @@
+/******************************************************
+Purge obsolete records
+
+(c) 1997 Innobase Oy
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "log0log.h"
+
+/************************************************************************
+Creates a purge node to a query graph. */
+
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ /* out, own: purge node */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ purge_node_t* node;
+
+ ut_ad(parent && heap);
+
+ node = mem_heap_alloc(heap, sizeof(purge_node_t));
+
+ node->common.type = QUE_NODE_PURGE;
+ node->common.parent = parent;
+
+ node->heap = mem_heap_create(256);
+
+ return(node);
+}
+
+/***************************************************************
+Repositions the pcur in the purge node on the clustered index record,
+if found. */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ /* out: TRUE if the record was found */
+ ulint mode, /* in: latching mode */
+ purge_node_t* node, /* in: row purge node */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool found;
+
+ if (node->found_clust) {
+ found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+
+ return(found);
+ }
+
+ found = row_search_on_row_ref(&(node->pcur), mode, node->table,
+ node->ref, mtr);
+ node->found_clust = found;
+
+ if (found) {
+ btr_pcur_store_position(&(node->pcur), mtr);
+ }
+
+ return(found);
+}
+
+/***************************************************************
+Removes a delete marked clustered index record if possible. */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ /* out: TRUE if success, or if not found, or
+ if modified after the delete marking */
+ purge_node_t* node, /* in: row purge node */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ mtr_start(&mtr);
+
+ success = row_purge_reposition_pcur(mode, node, &mtr);
+
+ if (!success) {
+ /* The record is already removed */
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(rec, index, rec_get_offsets(
+ rec, index, offsets_, ULINT_UNDEFINED, &heap)))) {
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ /* Someone else has modified the record later: do not remove */
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, FALSE, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_error;
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a clustered index record if it has not been modified after the delete
+marking. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node) /* in: row purge node */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+/* fputs("Purge: Removing clustered record\n", stderr); */
+
+ success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible. */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+ /* out: TRUE if success or if not found */
+ purge_node_t* node, /* in: row purge node */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has = 0; /* remove warning */
+ ibool found;
+ ulint err;
+ mtr_t mtr;
+ mtr_t* mtr_vers;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ if (!found) {
+ /* Not found */
+
+ /* fputs("PURGE:........sec entry not found\n", stderr); */
+ /* dtuple_print(entry); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* We should remove the index record if no later version of the row,
+ which cannot be purged yet, requires its existence. If some requires,
+ we should do nothing. */
+
+ mtr_vers = mem_alloc(sizeof(mtr_t));
+
+ mtr_start(mtr_vers);
+
+ success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, mtr_vers);
+
+ if (success) {
+ old_has = row_vers_old_has_index_entry(TRUE,
+ btr_pcur_get_rec(&(node->pcur)),
+ mtr_vers, index, entry);
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), mtr_vers);
+
+ mem_free(mtr_vers);
+
+ if (!success || !old_has) {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ FALSE, &mtr);
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_error;
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /* in: row purge node */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+/* fputs("Purge: Removing secondary record\n", stderr); */
+
+ success = row_purge_remove_sec_if_poss_low(node, index, entry,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_low(node, index, entry,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***************************************************************
+Purges a delete marking of a record. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node) /* in: row purge node */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ /* Build the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_purge_remove_sec_if_poss(node, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ row_purge_remove_clust_if_poss(node);
+}
+
+/***************************************************************
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern(
+/*==========================*/
+ purge_node_t* node) /* in: row purge node */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ upd_field_t* ufield;
+ ibool is_insert;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ ulint internal_offset;
+ byte* data_field;
+ ulint data_field_len;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(node);
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ goto skip_secondaries;
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field_binary(NULL, node->index,
+ node->update)) {
+ /* Build the older version of the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_purge_remove_sec_if_poss(node, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+skip_secondaries:
+ /* Free possible externally stored fields */
+ for (i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ ufield = upd_get_nth_field(node->update, i);
+
+ if (ufield->extern_storage) {
+ /* We use the fact that new_val points to
+ node->undo_rec and get thus the offset of
+ dfield data inside the unod record. Then we
+ can calculate from node->roll_ptr the file
+ address of the new_val data */
+
+ internal_offset = ((byte*)ufield->new_val.data)
+ - node->undo_rec;
+
+ ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+ mtr_start(&mtr);
+
+ /* We have to acquire an X-latch to the clustered
+ index tree */
+
+ index = dict_table_get_first_index(node->table);
+
+ mtr_x_lock(dict_tree_get_lock(index->tree), &mtr);
+
+ /* NOTE: we must also acquire an X-latch to the
+ root page of the tree. We will need it when we
+ free pages from the tree. If the tree is of height 1,
+ the tree X-latch does NOT protect the root page,
+ because it is also a leaf page. Since we will have a
+ latch on an undo log page, we would break the
+ latching order if we would only later latch the
+ root page of such a tree! */
+
+ btr_root_get(index->tree, &mtr);
+
+ /* We assume in purge of externally stored fields
+ that the space id of the undo log record is 0! */
+
+ data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr)
+ + offset + internal_offset;
+
+#ifdef UNIV_SYNC_DEBUG
+ buf_page_dbg_add_level(buf_frame_align(data_field),
+ SYNC_TRX_UNDO_PAGE);
+#endif /* UNIV_SYNC_DEBUG */
+
+ data_field_len = ufield->new_val.len;
+
+ btr_free_externally_stored_field(index, data_field,
+ data_field_len, FALSE, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+ /* out: TRUE if purge operation required:
+ NOTE that then the CALLER must unfreeze
+ data dictionary! */
+ purge_node_t* node, /* in: row undo node */
+ ibool* updated_extern,
+ /* out: TRUE if an externally stored field
+ was updated */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ trx_t* trx;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ trx = thr_get_trx(thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ updated_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+
+ return(FALSE);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ node->table = NULL;
+
+ if (type == TRX_UNDO_UPD_EXIST_REC
+ && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) {
+
+ /* Purge requires no changes to indexes: we may return */
+
+ return(FALSE);
+ }
+
+ /* Prevent DROP TABLE etc. from running when we are doing the purge
+ for this row */
+
+ row_mysql_freeze_data_dictionary(trx);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ node->table = dict_table_get_on_id_low(table_id, trx);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ if (node->table == NULL) {
+ /* The table has been dropped: no need to do purge */
+
+ row_mysql_unfreeze_data_dictionary(trx);
+
+ return(FALSE);
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip purge of missing .ibd files */
+
+ node->table = NULL;
+
+ row_mysql_unfreeze_data_dictionary(trx);
+
+ return(FALSE);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index == NULL) {
+ /* The table was corrupt in the data dictionary */
+
+ row_mysql_unfreeze_data_dictionary(trx);
+
+ return(FALSE);
+ }
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+
+ /* Read to the partial row the fields that occur in indexes */
+
+ if (!cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ ptr = trx_undo_rec_get_partial_row(ptr, clust_index,
+ &(node->row), node->heap);
+ }
+
+ return(TRUE);
+}
+
+/***************************************************************
+Fetches an undo log record and does the purge for the recorded operation.
+If none left, or the current purge completed, returns the control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_purge(
+/*======*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dulint roll_ptr;
+ ibool purge_needed;
+ ibool updated_extern;
+ trx_t* trx;
+
+ ut_ad(node && thr);
+
+ trx = thr_get_trx(thr);
+
+ node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+ &(node->reservation),
+ node->heap);
+ if (!node->undo_rec) {
+ /* Purge completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+
+ if (node->undo_rec == &trx_purge_dummy_rec) {
+ purge_needed = FALSE;
+ } else {
+ purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
+ thr);
+ /* If purge_needed == TRUE, we must also remember to unfreeze
+ data dictionary! */
+ }
+
+ if (purge_needed) {
+ node->found_clust = FALSE;
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+ row_purge_del_mark(node);
+
+ } else if (updated_extern
+ || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ row_purge_upd_exist_or_extern(node);
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&(node->pcur));
+ }
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ trx_purge_rec_release(node->reservation);
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph. */
+
+que_thr_t*
+row_purge_step(
+/*===========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ purge_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ err = row_purge(node, thr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c
new file mode 100644
index 00000000000..a6d3f1d5ab0
--- /dev/null
+++ b/storage/innobase/row/row0row.c
@@ -0,0 +1,730 @@
+/******************************************************
+General row routines
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+
+/*************************************************************************
+Reads the trx id or roll ptr field from a clustered index record: this function
+is slower than the specialized inline functions. */
+
+dulint
+row_get_rec_sys_field(
+/*==================*/
+ /* out: value of the field */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ const ulint* offsets)/* in: rec_get_offsets(rec, index) */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ if (type == DATA_TRX_ID) {
+
+ return(trx_read_trx_id(field));
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+
+ return(trx_read_roll_ptr(field));
+ }
+}
+
+/*************************************************************************
+Sets the trx id or roll ptr field in a clustered index record: this function
+is slower than the specialized inline functions. */
+
+void
+row_set_rec_sys_field(
+/*==================*/
+ /* out: value of the field */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ dulint val) /* in: value to set */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ if (type == DATA_TRX_ID) {
+
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/*********************************************************************
+When an insert to a table is performed, this function builds the entry which
+has to be inserted to an index on the table. */
+
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ /* out: index entry which should be inserted */
+ dtuple_t* row, /* in: row which should be inserted to the
+ table */
+ dict_index_t* index, /* in: index on the table */
+ mem_heap_t* heap) /* in: memory heap from which the memory for
+ the index entry is allocated */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ dict_field_t* ind_field;
+ dfield_t* dfield;
+ dfield_t* dfield2;
+ dict_col_t* col;
+ ulint i;
+ ulint storage_len;
+ dtype_t* cur_type;
+
+ ut_ad(row && index && heap);
+ ut_ad(dtuple_check_typed(row));
+
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ if (index->type & DICT_UNIVERSAL) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ } else {
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ }
+
+ for (i = 0; i < entry_len; i++) {
+ ind_field = dict_index_get_nth_field(index, i);
+ col = ind_field->col;
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_copy(dfield, dfield2);
+
+ /* If a column prefix index, take only the prefix */
+ if (ind_field->prefix_len > 0
+ && dfield_get_len(dfield2) != UNIV_SQL_NULL) {
+
+ cur_type = dict_col_get_type(
+ dict_field_get_col(ind_field));
+
+ storage_len = dtype_get_at_most_n_mbchars(
+ cur_type,
+ ind_field->prefix_len,
+ dfield_get_len(dfield2), dfield2->data);
+
+ dfield_set_len(dfield, storage_len);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/***********************************************************************
+An inverse function to dict_row_build_index_entry. Builds a row from a
+record in a clustered index. */
+
+dtuple_t*
+row_build(
+/*======*/
+ /* out, own: row built; see the NOTE below! */
+ ulint type, /* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or
+ ROW_COPY_ALSO_EXTERNALS,
+ the two last copy also the data fields to
+ heap as the first only places pointers to
+ data fields on the index page, and thus is
+ more efficient */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec, /* in: record in the clustered index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ dict_field_t* ind_field;
+ dict_col_t* col;
+ dfield_t* dfield;
+ ulint n_fields;
+ byte* field;
+ ulint len;
+ ulint row_len;
+ byte* buf;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(index && rec && heap);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &tmp_heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ if (type != ROW_COPY_POINTERS) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, (ulint*) offsets);
+ }
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ row = dtuple_create(heap, row_len);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(rec, table->comp));
+
+ n_fields = rec_offs_n_fields(offsets);
+
+ dict_table_copy_types(row, table);
+
+ for (i = 0; i < n_fields; i++) {
+ ind_field = dict_index_get_nth_field(index, i);
+
+ if (ind_field->prefix_len == 0) {
+
+ col = dict_field_get_col(ind_field);
+ dfield = dtuple_get_nth_field(row,
+ dict_col_get_no(col));
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (type == ROW_COPY_ALSO_EXTERNALS
+ && rec_offs_nth_extern(offsets, i)) {
+
+ field = btr_rec_copy_externally_stored_field(
+ rec, offsets, i, &len, heap);
+ }
+
+ dfield_set_data(dfield, field, len);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(row));
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(row);
+}
+
+/***********************************************************************
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap. */
+
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ /* out, own: index entry built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the dtuple is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ ulint i;
+ byte* field;
+ ulint len;
+ ulint rec_len;
+ byte* buf;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(rec && heap && index);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ rec_len = rec_offs_n_fields(offsets);
+
+ entry = dtuple_create(heap, rec_len);
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(rec_len == dict_index_get_n_fields(index));
+
+ dict_index_copy_types(entry, index, rec_len);
+
+ dtuple_set_info_bits(entry,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+ for (i = 0; i < rec_len; i++) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(entry);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ /* out, own: row reference built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dtuple_t* ref;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(index && rec && heap);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len =
+ dict_index_get_nth_field(clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dfield_get_type(dfield),
+ clust_col_prefix_len, len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(ref);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: the data fields in ref will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ trx_t* trx) /* in: transaction */
+{
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_a(ref && index && rec);
+
+ if (!index->table) {
+ fputs("InnoDB: table ", stderr);
+ notfound:
+ ut_print_name(stderr, trx, index->table_name);
+ fputs(" for index ", stderr);
+ ut_print_name(stderr, trx, index->name);
+ fputs(" not found\n", stderr);
+ ut_error;
+ }
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (!clust_index) {
+ fputs("InnoDB: clust index for table ", stderr);
+ goto notfound;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len =
+ dict_index_get_nth_field(clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dfield_get_type(dfield),
+ clust_col_prefix_len, len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
+
+/***********************************************************************
+From a row build a row reference with which we can search the clustered
+index record. */
+
+void
+row_build_row_ref_from_row(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! ref must have the right number
+ of fields! */
+ dict_table_t* table, /* in: table */
+ dtuple_t* row) /* in: row
+ NOTE: the data fields in ref will point
+ directly into data of this row */
+{
+ dict_index_t* clust_index;
+ dict_field_t* field;
+ dfield_t* dfield;
+ dfield_t* dfield2;
+ dict_col_t* col;
+ ulint ref_len;
+ ulint i;
+ dtype_t* cur_type;
+
+ ut_ad(ref && table && row);
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field = dict_index_get_nth_field(clust_index, i);
+
+ col = dict_field_get_col(field);
+
+ dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_copy(dfield, dfield2);
+
+ if (field->prefix_len > 0
+ && dfield->len != UNIV_SQL_NULL) {
+
+ cur_type = dict_col_get_type(
+ dict_field_get_col(field));
+
+ dfield->len = dtype_get_at_most_n_mbchars(
+ cur_type,
+ field->prefix_len,
+ dfield->len, dfield->data);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+}
+
+/*******************************************************************
+Searches the clustered index record for a row, if we have the row reference. */
+
+ibool
+row_search_on_row_ref(
+/*==================*/
+ /* out: TRUE if found */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ dict_table_t* table, /* in: table */
+ dtuple_t* ref, /* in: row reference */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+ page_t* page;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+ page = buf_frame_align(rec);
+
+ if (rec == page_get_infimum_rec(page)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved. */
+
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ /* out: record or NULL, if no record found */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index, /* in: secondary index */
+ dict_index_t** clust_index,/* out: clustered index */
+ mtr_t* mtr) /* in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad((index->type & DICT_CLUSTERED) == 0);
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ return(clust_rec);
+}
+
+/*******************************************************************
+Searches an index record. */
+
+ibool
+row_search_index_entry(
+/*===================*/
+ /* out: TRUE if found */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ page_t* page;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+ page = buf_frame_align(rec);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ if (rec == page_get_infimum_rec(page)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != n_fields) {
+ /* Not found */
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.c
new file mode 100644
index 00000000000..94cf82d6a3d
--- /dev/null
+++ b/storage/innobase/row/row0sel.c
@@ -0,0 +1,4066 @@
+/*******************************************************
+Select
+
+(c) 1997 Innobase Oy
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "read0read.h"
+#include "buf0lru.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
+
+/************************************************************************
+Returns TRUE if the user-defined column values in a secondary index record
+are alphabetically the same as the corresponding columns in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation! */
+static
+ibool
+row_sel_sec_rec_is_for_clust_rec(
+/*=============================*/
+ /* out: TRUE if the secondary
+ record is equal to the corresponding
+ fields in the clustered record,
+ when compared with collation */
+ rec_t* sec_rec, /* in: secondary index record */
+ dict_index_t* sec_index, /* in: secondary index */
+ rec_t* clust_rec, /* in: clustered index record */
+ dict_index_t* clust_index) /* in: clustered index */
+{
+ dict_field_t* ifield;
+ dict_col_t* col;
+ byte* sec_field;
+ ulint sec_len;
+ byte* clust_field;
+ ulint clust_len;
+ ulint n;
+ ulint i;
+ dtype_t* cur_type;
+ mem_heap_t* heap = NULL;
+ ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
+ ulint* clust_offs = clust_offsets_;
+ ulint* sec_offs = sec_offsets_;
+ ibool is_equal = TRUE;
+
+ *clust_offsets_ = (sizeof clust_offsets_) / sizeof *clust_offsets_;
+ *sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_;
+
+ clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+ ULINT_UNDEFINED, &heap);
+ sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+ ULINT_UNDEFINED, &heap);
+
+ n = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+ for (i = 0; i < n; i++) {
+ ifield = dict_index_get_nth_field(sec_index, i);
+ col = dict_field_get_col(ifield);
+
+ clust_field = rec_get_nth_field(clust_rec, clust_offs,
+ dict_col_get_clust_pos(col),
+ &clust_len);
+ sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+ if (ifield->prefix_len > 0
+ && clust_len != UNIV_SQL_NULL) {
+
+ cur_type = dict_col_get_type(
+ dict_field_get_col(ifield));
+
+ clust_len = dtype_get_at_most_n_mbchars(
+ cur_type,
+ ifield->prefix_len,
+ clust_len, (char*) clust_field);
+ }
+
+ if (0 != cmp_data_data(dict_col_get_type(col),
+ clust_field, clust_len,
+ sec_field, sec_len)) {
+ is_equal = FALSE;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(is_equal);
+}
+
+/*************************************************************************
+Creates a select node struct. */
+
+sel_node_t*
+sel_node_create(
+/*============*/
+ /* out, own: select node struct */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(sel_node_t));
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->select_will_do_update = FALSE;
+ node->latch_mode = BTR_SEARCH_LEAF;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*************************************************************************
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /* in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*************************************************************************
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /* in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*************************************************************************
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /* in: first variable in a list of variables */
+ sel_node_t* node) /* in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ exp = node->select_list;
+
+ while (var) {
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ var = que_node_get_next(var);
+ }
+}
+
+/*************************************************************************
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /* in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ func_node = node->select_list;
+
+ while (func_node) {
+ eval_node_set_int_val(func_node, 0);
+
+ func_node = que_node_get_next(func_node);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*************************************************************************
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /* in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*************************************************************************
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /* in: record index */
+ rec_t* rec, /* in: record in a clustered or non-clustered
+ index */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ sym_node_t* column) /* in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (index->type & DICT_CLUSTERED) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ data = rec_get_nth_field(rec, offsets, field_no, &len);
+
+ if (column->copy_val) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Allocates a prefetch buffer for a column when prefetch is first time done. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /* in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+ * sizeof(sel_buf_t));
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*************************************************************************
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ mem_free(sel_buf->data);
+ }
+ }
+}
+
+/*************************************************************************
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them to the val fields in the column nodes. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+ plan_t* plan) /* in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+#ifdef UNIV_DEBUG
+ dfield_set_data(val, NULL, 0);
+#endif
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = dfield_get_data(val);
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*************************************************************************
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+ plan_t* plan) /* in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ if (!column->copy_val) {
+ /* There is no sense to push pointers to database
+ page fields when we do not keep latch on the page! */
+
+ goto next_col;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = dfield_get_data(val);
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+ /* out: DB_SUCCESS or error code */
+ read_view_t* read_view, /* in: read view */
+ plan_t* plan, /* in: plan node for table */
+ rec_t* rec, /* in: record in a clustered index */
+ ulint** offsets, /* in/out: offsets returned by
+ rec_get_offsets(rec, plan->index) */
+ mem_heap_t** offset_heap, /* in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /* out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ if (plan->old_vers_heap) {
+ mem_heap_empty(plan->old_vers_heap);
+ } else {
+ plan->old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(rec, mtr, plan->index,
+ offsets, read_view, offset_heap,
+ plan->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*************************************************************************
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted. */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ /* out: TRUE if row passed the tests */
+ plan_t* plan) /* in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(cond->args);
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Tests the other conditions. */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ /* out: TRUE if row passed the tests */
+ plan_t* plan) /* in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+ /* out: DB_SUCCESS or error code */
+ sel_node_t* node, /* in: select_node */
+ plan_t* plan, /* in: plan node for table */
+ rec_t* rec, /* in: record in a non-clustered index */
+ que_thr_t* thr, /* in: query thread */
+ rec_t** out_rec,/* out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ mtr_t* mtr) /* in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ offsets = rec_get_offsets(rec,
+ btr_pcur_get_btr_cur(&plan->pcur)->index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ node->latch_mode, &(plan->clust_pcur),
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&(plan->clust_pcur))
+ < dict_index_get_n_unique(index)) {
+
+ ut_a(rec_get_deleted_flag(rec, plan->table->comp));
+ ut_a(node->read_view);
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.c
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ clust_rec = NULL;
+
+ goto func_exit;
+ }
+
+ offsets = rec_get_offsets(clust_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ ulint lock_type;
+
+ if (srv_locks_unsafe_for_binlog) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = lock_clust_rec_read_check_and_lock(0,
+ clust_rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(node->read_view, plan,
+ clust_rec, &offsets, &heap,
+ &old_vers, mtr);
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ goto func_exit;
+ }
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query where we want to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we would not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+ if ((old_vers || rec_get_deleted_flag(rec, plan->table->comp))
+ && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
+ clust_rec, index)) {
+ clust_rec = NULL;
+ goto func_exit;
+ }
+ }
+
+ /* Fetch the columns needed in test conditions */
+
+ row_sel_fetch_columns(index, clust_rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+func_exit:
+ *out_rec = clust_rec;
+ err = DB_SUCCESS;
+err_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*************************************************************************
+Sets a lock on a record. */
+UNIV_INLINE
+ulint
+sel_set_rec_lock(
+/*=============*/
+ /* out: DB_SUCCESS or error code */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: index */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ ulint mode, /* in: lock mode */
+ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */
+ que_thr_t* thr) /* in: query thread */
+{
+ trx_t* trx;
+ ulint err;
+
+ trx = thr_get_trx(thr);
+
+ if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+ }
+
+ if (index->type & DICT_CLUSTERED) {
+ err = lock_clust_rec_read_check_and_lock(0,
+ rec, index, offsets, mode, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(0,
+ rec, index, offsets, mode, type, thr);
+ }
+
+ return(err);
+}
+
+/*************************************************************************
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ sel_node_t* node, /* in: select node */
+ plan_t* plan, /* in: table plan */
+ ibool search_latch_locked,
+ /* in: TRUE if the thread currently
+ has the search latch locked in
+ s-mode */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
+ ulint i;
+
+ if (search_latch_locked) {
+ has_search_latch = RW_S_LATCH;
+ }
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ node->latch_mode, &(plan->pcur),
+ has_search_latch, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
+ &(plan->pcur), FALSE, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*************************************************************************
+Restores a stored pcur position to a table index. */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ /* out: TRUE if the cursor should be moved to
+ the next record after we return from this
+ function (moved to the previous, in the case
+ of a descending cursor) without processing
+ again the current cursor record */
+ sel_node_t* node, /* in: select node */
+ plan_t* plan, /* in: table plan */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(node->latch_mode,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /* in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+/*************************************************************************
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+ sel_node_t* node, /* in: select node for a consistent read */
+ plan_t* plan, /* in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ulint ret;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ row_sel_open_pcur(node, plan, TRUE, mtr);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (index->type & DICT_CLUSTERED) {
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) {
+
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+
+ /* Test deleted flag. Fetch the columns needed in test conditions. */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ if (rec_get_deleted_flag(rec, plan->table->comp)) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+ plan->n_rows_fetched++;
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(SEL_FOUND);
+}
+
+/*************************************************************************
+Performs a select step. */
+static
+ulint
+row_sel(
+/*====*/
+ /* out: DB_SUCCESS or error code */
+ sel_node_t* node, /* in: select node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool search_latch_locked;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool leaf_contains_updates = FALSE;
+ /* TRUE if select_will_do_update is
+ TRUE and the current clustered index
+ leaf page has been updated during
+ the current mtr: mtr must be committed
+ at the same time as the leaf x-latch
+ is released */
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ ulint found_flag;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(thr->run_node == node);
+
+ search_latch_locked = FALSE;
+
+ if (node->read_view) {
+ /* In consistent reads, we try to do with the hash index and
+ not to use the buffer page get. This is to reduce memory bus
+ load resulting from semaphore operations. The search latch
+ will be s-locked when we access an index with a unique search
+ condition, but not locked when we access an index with a
+ less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(leaf_contains_updates == FALSE);
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+ /* The cursor has already reached the result set end: no more
+ rows to process for this table cursor, as also the prefetch
+ stack was empty */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr_start(&mtr);
+
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust) {
+ if (!search_latch_locked) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+ } else if (btr_search_latch.writer_is_wait_ex) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+ kept for some 10 searches before being released,
+ a waiting x-latch request would block other threads
+ from acquiring an s-latch for a long time, lowering
+ performance significantly in multiprocessors. */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+ found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+ if (found_flag == SEL_FOUND) {
+
+ goto next_table;
+
+ } else if (found_flag == SEL_EXHAUSTED) {
+
+ goto table_exhausted;
+ }
+
+ ut_ad(found_flag == SEL_RETRY);
+
+ plan_reset_cursor(plan);
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ search_latch_locked = FALSE;
+ }
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+
+ row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && (rec != page_get_supremum_rec(buf_frame_align(rec)))) {
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+
+ /* If innodb_locks_unsafe_for_binlog option is used,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+
+ rec_t* next_rec = page_rec_get_next(rec);
+ ulint lock_type;
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (srv_locks_unsafe_for_binlog) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(next_rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+ processed yet another page of index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+
+ ulint lock_type;
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (srv_locks_unsafe_for_binlog) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ would store the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+ /* PHASE 2: Check a mixed index mix id if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+ tells how many fields in the user record matched to the search
+ tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ } else if (plan->mixed_index) {
+ /* We have to check if the record in a mixed cluster belongs
+ to this table */
+
+ if (!dict_is_mixed_table_rec(plan->table, rec)) {
+
+ goto next_rec;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ cons_read_requires_clust_rec = FALSE;
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (index->type & DICT_CLUSTERED) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(node->read_view,
+ plan, rec,
+ &offsets, &heap,
+ &old_vers, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ row_sel_fetch_columns(index, rec,
+ offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec, plan->table->comp)
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, plan->table->comp)) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+ if (node->select_will_do_update) {
+ /* This is a searched update and we can do the update in-place,
+ saving CPU time */
+
+ row_upd_in_place_in_select(node, thr, &mtr);
+
+ leaf_contains_updates = TRUE;
+
+ /* When the database is in the online backup mode, the number
+ of log records for a single mtr should be small: increment the
+ cost counter to ensure it */
+
+ cost_counter += 1 + (SEL_COST_LIMIT / 8);
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_push_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ ut_ad(!search_latch_locked);
+
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we would access a different clustered
+ index page right away without releasing the previous. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (leaf_contains_updates
+ && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) {
+
+ /* We must commit &mtr if we are moving to a different page,
+ because we have done updates to the x-latched leaf page, and
+ the latch would be released in btr_pcur_move_to_next, without
+ &mtr getting committed there */
+
+ ut_ad(node->asc);
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+ err = DB_SUCCESS;
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ goto func_exit;
+ }
+
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ goto func_exit;
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+ err = DB_SUCCESS;
+ goto func_exit;
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr would not be committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ ut_ad(!search_latch_locked);
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr)
+ || !node->asc);
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/**************************************************************************
+Performs a select step. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_sel_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint i_lock_mode;
+ sym_node_t* table_node;
+ sel_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+ /* If this is a new time this node is executed (or when execution
+ resumes after wait for a table intention lock), set intention locks
+ on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ /* Assign a read view for the query */
+ node->read_view = trx_assign_read_view(
+ thr_get_trx(thr));
+ } else {
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ table_node = node->table_list;
+
+ while (table_node) {
+ err = lock_table(0, table_node->table,
+ i_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+
+ que_thr_handle_error(thr, DB_ERROR,
+ NULL, 0);
+ return(NULL);
+ }
+
+ table_node = que_node_get_next(table_node);
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor &&
+ UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+ fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+ que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+ return(NULL);
+ }
+
+ return(thr);
+}
+
+/**************************************************************************
+Performs a fetch for a cursor. */
+
+que_thr_t*
+fetch_step(
+/*=======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ sel_assign_into_var_values(node->into_list, sel_node);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ /* Make the fetch node the parent of the cursor definition for
+ the time of the fetch, so that execution knows to return to this
+ fetch node after a row has been selected or we know that there is
+ no row left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ /* SQL error detected */
+ fprintf(stderr, "SQL error %lu\n", (ulong)DB_ERROR);
+
+ que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/***************************************************************
+Prints a row in a select result. */
+
+que_thr_t*
+row_printf_step(
+/*============*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ fputs(" ::: ", stderr);
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/********************************************************************
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field len and print a warning if
+such appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /* in: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /* in: buffer to use in field
+ conversions */
+ ulint buf_len, /* in: buffer length */
+ dict_index_t* index, /* in: index of the key value */
+ byte* key_ptr, /* in: MySQL key value */
+ ulint key_len, /* in: MySQL key value length */
+ trx_t* trx) /* in: transaction */
+{
+ byte* original_buf = buf;
+ byte* original_key_ptr = key_ptr;
+ dict_field_t* field;
+ dfield_t* dfield;
+ ulint data_offset;
+ ulint data_len;
+ ulint data_field_len;
+ ibool is_null;
+ byte* key_end;
+ ulint n_fields = 0;
+ ulint type;
+
+ /* For documentation of the key value storage format in MySQL, see
+ ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ field = dict_index_get_nth_field(index, 0);
+
+ if (dfield_get_type(dfield)->mtype == DATA_SYS) {
+ /* A special case: we are looking for a position in the
+ generated clustered index which InnoDB automatically added
+ to a table with no primary key: the first and the only
+ ordering column is ROW_ID which InnoDB stored to the key_ptr
+ buffer. */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+
+ ut_a(dict_col_get_type(field->col)->mtype
+ == dfield_get_type(dfield)->mtype);
+
+ data_offset = 0;
+ is_null = FALSE;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ data_offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+ is_null = TRUE;
+ }
+ }
+
+ type = dfield_get_type(dfield)->mtype;
+
+ /* Calculate data length and data field total length */
+
+ if (type == DATA_BLOB) {
+ /* The key field is a column prefix of a BLOB or
+ TEXT */
+
+ ut_a(field->prefix_len > 0);
+
+ /* MySQL stores the actual data length to the first 2
+ bytes after the optional SQL NULL marker byte. The
+ storage format is little-endian, that is, the most
+ significant byte at a higher address. In UTF-8, MySQL
+ seems to reserve field->prefix_len bytes for
+ storing this field in the key value buffer, even
+ though the actual value only takes data_len bytes
+ from the start. */
+
+ data_len = key_ptr[data_offset]
+ + 256 * key_ptr[data_offset + 1];
+ data_field_len = data_offset + 2 + field->prefix_len;
+
+ data_offset += 2;
+
+ /* Now that we know the length, we store the column
+ value like it would be a fixed char field */
+
+ } else if (field->prefix_len > 0) {
+ /* Looks like MySQL pads unused end bytes in the
+ prefix with space. Therefore, also in UTF-8, it is ok
+ to compare with a prefix containing full prefix_len
+ bytes, and no need to take at most prefix_len / 3
+ UTF-8 characters from the start.
+ If the prefix is used as the upper end of a LIKE
+ 'abc%' query, then MySQL pads the end with chars
+ 0xff. TODO: in that case does it any harm to compare
+ with the full prefix_len bytes. How do characters
+ 0xff in UTF-8 behave? */
+
+ data_len = field->prefix_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ data_len = dfield_get_type(dfield)->len;
+ data_field_len = data_offset + data_len;
+ }
+
+ if (dtype_get_mysql_type(dfield_get_type(dfield))
+ == DATA_MYSQL_TRUE_VARCHAR
+ && dfield_get_type(dfield)->mtype != DATA_INT) {
+ /* In a MySQL key value format, a true VARCHAR is
+ always preceded by 2 bytes of a length field.
+ dfield_get_type(dfield)->len returns the maximum
+ 'payload' len in bytes. That does not include the
+ 2 bytes that tell the actual data length.
+
+ We added the check != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR! */
+
+ data_len += 2;
+ data_field_len += 2;
+ }
+
+ /* Storing may use at most data_len bytes of buf */
+
+ if (!is_null) {
+ row_mysql_store_col_in_innobase_format(
+ dfield,
+ buf,
+ FALSE, /* MySQL key value format col */
+ key_ptr + data_offset,
+ data_len,
+ index->table->comp);
+ buf += data_len;
+ }
+
+ key_ptr += data_field_len;
+
+ if (key_ptr > key_end) {
+ /* The last field in key was not a complete key field
+ but a prefix of it.
+
+ Print a warning about this! HA_READ_PREFIX_LAST does
+ not currently work in InnoDB with partial-field key
+ value prefixes. Since MySQL currently uses a padding
+ trick to calculate LIKE 'abc%' type queries there
+ should never be partial-field prefixes in searches. */
+
+ ut_print_timestamp(stderr);
+
+ fputs(
+ " InnoDB: Warning: using a partial-field key prefix in search.\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, ". Last data field length %lu bytes,\n"
+ "InnoDB: key ptr now exceeds key end by %lu bytes.\n"
+ "InnoDB: Key value in the MySQL format:\n",
+ (ulong) data_field_len,
+ (ulong) (key_ptr - key_end));
+ fflush(stderr);
+ ut_print_buf(stderr, original_key_ptr, key_len);
+ fprintf(stderr, "\n");
+
+ if (!is_null) {
+ dfield->len -= (ulint)(key_ptr - key_end);
+ }
+ }
+
+ n_fields++;
+ field++;
+ dfield++;
+ }
+
+ ut_a(buf <= original_buf + buf_len);
+
+ /* We set the length of tuple to n_fields: we assume that the memory
+ area allocated for it is big enough (usually bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
+
+/******************************************************************
+Stores the row id to the prebuilt struct. */
+static
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt */
+ rec_t* index_rec, /* in: record */
+ dict_index_t* index, /* in: index of the record */
+ const ulint* offsets) /* in: rec_get_offsets
+ (index_rec, index) */
+{
+ byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(index_rec, index, offsets));
+
+ data = rec_get_nth_field(index_rec, offsets,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+
+ if (len != DATA_ROW_ID_LEN) {
+ fprintf(stderr,
+"InnoDB: Error: Row id field is wrong length %lu in ", (ulong) len);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+ fprintf(stderr, "\n"
+"InnoDB: Field number %lu, record:\n",
+ (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID));
+ rec_print_new(stderr, index_rec, offsets);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ ut_memcpy(prebuilt->row_id, data, len);
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
+static
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+ byte* dest, /* in/out: buffer where to store; NOTE that BLOBs
+ are not in themselves stored here: the caller must
+ allocate and copy the BLOB into buffer before, and pass
+ the pointer to the BLOB in 'data' */
+ const mysql_row_templ_t* templ, /* in: MySQL column template.
+ Its following fields are referenced:
+ type, is_unsigned, mysql_col_len, mbminlen, mbmaxlen */
+ byte* data, /* in: data to store */
+ ulint len) /* in: length of the data */
+{
+ byte* ptr;
+ byte* field_end;
+ byte* pad_ptr;
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ if (templ->type == DATA_INT) {
+ /* Convert integer data from Innobase to a little-endian
+ format, sign bit restored to normal */
+
+ ptr = dest + len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == dest) {
+ break;
+ }
+ data++;
+ }
+
+ if (!templ->is_unsigned) {
+ dest[len - 1] = (byte) (dest[len - 1] ^ 128);
+ }
+
+ ut_ad(templ->mysql_col_len == len);
+ } else if (templ->type == DATA_VARCHAR
+ || templ->type == DATA_VARMYSQL
+ || templ->type == DATA_BINARY) {
+
+ field_end = dest + templ->mysql_col_len;
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(dest, len,
+ templ->mysql_length_bytes);
+ }
+
+ /* Copy the actual data */
+ ut_memcpy(dest, data, len);
+
+ /* Pad with trailing spaces. We pad with spaces also the
+ unused end of a >= 5.0.3 true VARCHAR column, just in case
+ MySQL expects its contents to be deterministic. */
+
+ pad_ptr = dest + len;
+
+ ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+ /* We handle UCS2 charset strings differently. */
+ if (templ->mbminlen == 2) {
+ /* A space char is two bytes, 0x0020 in UCS2 */
+
+ if (len & 1) {
+ /* A 0x20 has been stripped from the column.
+ Pad it back. */
+
+ if (pad_ptr < field_end) {
+ *pad_ptr = 0x20;
+ pad_ptr++;
+ }
+ }
+
+ /* Pad the rest of the string with 0x0020 */
+
+ while (pad_ptr < field_end) {
+ *pad_ptr = 0x00;
+ pad_ptr++;
+ *pad_ptr = 0x20;
+ pad_ptr++;
+ }
+ } else {
+ ut_ad(templ->mbminlen == 1);
+ /* space=0x20 */
+
+ memset(pad_ptr, 0x20, field_end - pad_ptr);
+ }
+ } else if (templ->type == DATA_BLOB) {
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+ len);
+ } else if (templ->type == DATA_MYSQL) {
+ memcpy(dest, data, len);
+
+ ut_a(templ->mysql_col_len >= len);
+ ut_a(templ->mbmaxlen >= templ->mbminlen);
+
+ ut_a(templ->mbmaxlen > templ->mbminlen
+ || templ->mysql_col_len == len);
+ /* The following assertion would fail for old tables
+ containing UTF-8 ENUM columns due to Bug #9526. */
+ ut_ad(!templ->mbmaxlen
+ || !(templ->mysql_col_len % templ->mbmaxlen));
+ ut_a(len * templ->mbmaxlen >= templ->mysql_col_len);
+
+ if (templ->mbminlen != templ->mbmaxlen) {
+ /* Pad with spaces. This undoes the stripping
+ done in row0mysql.ic, function
+ row_mysql_store_col_in_innobase_format(). */
+
+ memset(dest + len, 0x20, templ->mysql_col_len - len);
+ }
+ } else {
+ ut_a(templ->type == DATA_CHAR
+ || templ->type == DATA_FIXBINARY
+ /*|| templ->type == DATA_SYS_CHILD
+ || templ->type == DATA_SYS*/
+ || templ->type == DATA_FLOAT
+ || templ->type == DATA_DOUBLE
+ || templ->type == DATA_DECIMAL);
+ ut_ad(templ->mysql_col_len == len);
+
+ memcpy(dest, data, len);
+ }
+}
+
+/******************************************************************
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec, other columns are left blank. All columns may not
+be needed in the query. */
+static
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+ /* out: TRUE if success, FALSE if
+ could not allocate memory for a BLOB
+ (though we may also assert in that
+ case) */
+ byte* mysql_rec, /* out: row in the MySQL format */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec, /* in: Innobase record in the index
+ which was described in prebuilt's
+ template */
+ const ulint* offsets) /* in: array returned by
+ rec_get_offsets() */
+{
+ mysql_row_templ_t* templ;
+ mem_heap_t* extern_field_heap = NULL;
+ byte* data;
+ ulint len;
+ byte* blob_buf;
+ int pad_char;
+ ulint i;
+
+ ut_ad(prebuilt->mysql_template);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (prebuilt->blob_heap != NULL) {
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ }
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ data = rec_get_nth_field(rec, offsets,
+ templ->rec_field_no, &len);
+
+ if (rec_offs_nth_extern(offsets, templ->rec_field_no)) {
+
+ /* Copy an externally stored field to the temporary
+ heap */
+
+ ut_a(!prebuilt->trx->has_search_latch);
+
+ extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE);
+
+ /* NOTE: if we are retrieving a big BLOB, we may
+ already run out of memory in the next call, which
+ causes an assert */
+
+ data = btr_rec_copy_externally_stored_field(rec,
+ offsets, templ->rec_field_no, &len,
+ extern_field_heap);
+
+ ut_a(len != UNIV_SQL_NULL);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ if (templ->type == DATA_BLOB) {
+
+ ut_a(prebuilt->templ_contains_blob);
+
+ /* A heuristic test that we can allocate the
+ memory for a big BLOB. We have a safety margin
+ of 1000000 bytes. Since the test takes some
+ CPU time, we do not use it for small BLOBs. */
+
+ if (len > 2000000
+ && !ut_test_malloc(len + 1000000)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: could not allocate %lu + 1000000 bytes to retrieve\n"
+"InnoDB: a big column. Table name ", (ulong) len);
+ ut_print_name(stderr,
+ prebuilt->trx,
+ prebuilt->table->name);
+ putc('\n', stderr);
+
+ if (extern_field_heap) {
+ mem_heap_free(
+ extern_field_heap);
+ }
+ return(FALSE);
+ }
+
+ /* Copy the BLOB data to the BLOB heap of
+ prebuilt */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap =
+ mem_heap_create(len);
+ }
+
+ blob_buf = mem_heap_alloc(prebuilt->blob_heap,
+ len);
+ ut_memcpy(blob_buf, data, len);
+
+ data = blob_buf;
+ }
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, data, len);
+
+ /* Cleanup */
+ if (extern_field_heap) {
+ mem_heap_free(extern_field_heap);
+ extern_field_heap = NULL;
+ }
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset] &=
+ ~(byte) (templ->mysql_null_bit_mask);
+ }
+ } else {
+ /* MySQL seems to assume the field for an SQL NULL
+ value is set to zero or space. Not taking this into
+ account caused seg faults with NULL BLOB fields, and
+ bug number 154 in the MySQL bug database: GROUP BY
+ and DISTINCT could treat NULL values inequal. */
+
+ mysql_rec[templ->mysql_null_byte_offset] |=
+ (byte) (templ->mysql_null_bit_mask);
+ if (templ->type == DATA_VARCHAR
+ || templ->type == DATA_CHAR
+ || templ->type == DATA_BINARY
+ || templ->type == DATA_FIXBINARY
+ || templ->type == DATA_MYSQL
+ || templ->type == DATA_VARMYSQL) {
+ /* MySQL pads all non-BLOB and non-TEXT
+ string types with space ' ' */
+
+ pad_char = ' ';
+ } else {
+ pad_char = '\0';
+ }
+
+ /* Handle UCS2 strings differently. */
+ if (pad_char != '\0' && templ->mbminlen == 2) {
+ /* There are two bytes per char, so the length
+ has to be an even number. */
+ ut_a(!(templ->mysql_col_len & 1));
+ data = mysql_rec + templ->mysql_col_offset;
+ len = templ->mysql_col_len;
+ /* Pad with 0x0020. */
+ while (len >= 2) {
+ *data++ = 0x00;
+ *data++ = 0x20;
+ len -= 2;
+ }
+ } else {
+ ut_ad(!pad_char || templ->mbminlen == 1);
+ memset(mysql_rec + templ->mysql_col_offset,
+ pad_char, templ->mysql_col_len);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ read_view_t* read_view, /* in: read view */
+ dict_index_t* clust_index, /* in: clustered index */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec, /* in: record in a clustered index */
+ ulint** offsets, /* in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /* in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /* out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(rec, mtr, clust_index,
+ offsets, read_view, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface. */
+static
+ulint
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+ /* out: DB_SUCCESS or error code */
+ row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/* in: secondary index where rec resides */
+ rec_t* rec, /* in: record in a non-clustered index; if
+ this is a locking read, then rec is not
+ allowed to be delete-marked, and that would
+ not make sense either */
+ que_thr_t* thr, /* in: query thread */
+ rec_t** out_rec,/* out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ ulint** offsets,/* out: offsets returned by
+ rec_get_offsets(out_rec, clust_index) */
+ mem_heap_t** offset_heap,/* in/out: memory heap from which
+ the offsets are allocated */
+ mtr_t* mtr) /* in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+ trx = thr_get_trx(thr);
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec, trx);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ prebuilt->clust_pcur->trx_if_known = trx;
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(prebuilt->clust_pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.c
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ if (!rec_get_deleted_flag(rec, sec_index->table->comp)
+ || prebuilt->select_lock_type != LOCK_NONE) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: error clustered record"
+ " for sec rec not found\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, sec_index);
+ fputs("\n"
+ "InnoDB: sec index record ", stderr);
+ rec_print(stderr, rec, sec_index);
+ fputs("\n"
+ "InnoDB: clust index record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ putc('\n', stderr);
+ trx_print(stderr, trx);
+
+ fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+ }
+
+ clust_rec = NULL;
+
+ goto func_exit;
+ }
+
+ *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
+
+ err = lock_clust_rec_read_check_and_lock(0, clust_rec,
+ clust_index, *offsets,
+ prebuilt->select_lock_type,
+ LOCK_REC_NOT_GAP, thr);
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ /* If the isolation level allows reading of uncommitted data,
+ then we never look for an earlier version */
+
+ if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && !lock_clust_rec_cons_read_sees(clust_rec, clust_index,
+ *offsets, trx->read_view)) {
+
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, clust_rec,
+ offsets, offset_heap,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query where we want to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we would not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+ if (clust_rec && (old_vers
+ || rec_get_deleted_flag(rec, sec_index->table->comp))
+ && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+ clust_rec, clust_index)) {
+ clust_rec = NULL;
+ } else {
+#ifdef UNIV_SEARCH_DEBUG
+ ut_a(clust_rec == NULL ||
+ row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+ clust_rec, clust_index));
+#endif
+ }
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type == LOCK_X) {
+ /* We may use the cursor in update: store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ err = DB_SUCCESS;
+err_exit:
+ return(err);
+}
+
+/************************************************************************
+Restores cursor position after it has been stored. We have to take into
+account that the record cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down. */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+ /* out: TRUE if we may need to
+ process the record the cursor is
+ now positioned on (i.e. we should
+ not go to the next record yet) */
+ ulint latch_mode, /* in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /* in: cursor whose position
+ has been stored */
+ ibool moves_up, /* in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /* in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+ ulint relative_position;
+
+ relative_position = pcur->rel_pos;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ if (relative_position == BTR_PCUR_ON) {
+ if (success) {
+ return(FALSE);
+ }
+
+ if (moves_up) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ if (relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
+
+ if (moves_up) {
+ return(TRUE);
+ }
+
+ if (btr_pcur_is_on_user_rec(pcur, mtr)) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
+
+ if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+}
+
+/************************************************************************
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+ byte* buf, /* in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct */
+{
+ ulint i;
+ mysql_row_templ_t* templ;
+ byte* cached_rec;
+ ut_ad(prebuilt->n_fetch_cached > 0);
+
+ if (prebuilt->keep_other_fields_on_keyread)
+ {
+ /* Copy cache record field by field, don't touch fields that
+ are not covered by current key */
+ cached_rec =
+ prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+ ut_memcpy(
+ buf + templ->mysql_col_offset,
+ cached_rec + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ /* Copy NULL bit of the current field from cached_rec
+ to buf */
+ if (templ->mysql_null_bit_mask)
+ {
+ buf[templ->mysql_null_byte_offset] ^=
+ (buf[templ->mysql_null_byte_offset] ^
+ cached_rec[templ->mysql_null_byte_offset]) &
+ (byte)templ->mysql_null_bit_mask;
+ }
+ }
+ }
+ else
+ {
+ ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first],
+ prebuilt->mysql_row_len);
+ }
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/************************************************************************
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec, /* in: record to push */
+ const ulint* offsets) /* in: rec_get_offsets() */
+{
+ byte* buf;
+ ulint i;
+
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_a(!prebuilt->templ_contains_blob);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+
+ /* A user has reported memory corruption in these
+ buffers in Linux. Put magic numbers there to help
+ to track a possible bug. */
+
+ buf = mem_alloc(prebuilt->mysql_row_len + 8);
+
+ prebuilt->fetch_cache[i] = buf + 4;
+
+ mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
+ mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
+ ROW_PREBUILT_FETCH_MAGIC_N);
+ }
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+
+ if (!row_sel_store_mysql_rec(
+ prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt, rec, offsets)) {
+ ut_error;
+ }
+
+ prebuilt->n_fetch_cached++;
+}
+
+/*************************************************************************
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). We assume that the search
+mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
+btr search latch has been locked in S-mode. */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+ /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+ rec_t** out_rec,/* out: record if found */
+ row_prebuilt_t* prebuilt,/* in: prebuilt struct */
+ ulint** offsets,/* in/out: for rec_get_offsets(*out_rec) */
+ mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */
+ mtr_t* mtr) /* in: started mtr */
+{
+ dict_index_t* index = prebuilt->index;
+ dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ rec_t* rec;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(!prebuilt->templ_contains_blob);
+
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
+#ifndef UNIV_SEARCH_DEBUG
+ RW_S_LATCH,
+#else
+ 0,
+#endif
+ mtr);
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ *offsets = rec_get_offsets(rec, index, *offsets,
+ ULINT_UNDEFINED, heap);
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ *offsets, trx->read_view)) {
+
+ return(SEL_RETRY);
+ }
+
+ if (rec_get_deleted_flag(rec, index->table->comp)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ *out_rec = rec;
+
+ return(SEL_FOUND);
+}
+
+/************************************************************************
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position and fetch next or fetch prev must not be tried to the cursor! */
+
+ulint
+row_search_for_mysql(
+/*=================*/
+ /* out: DB_SUCCESS,
+ DB_RECORD_NOT_FOUND,
+ DB_END_OF_INDEX, DB_DEADLOCK,
+ DB_LOCK_TABLE_FULL, DB_CORRUPTION,
+ or DB_TOO_BIG_RECORD */
+ byte* buf, /* in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /* in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /* in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /* in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+{
+ dict_index_t* index = prebuilt->index;
+ dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ rec_t* rec;
+ rec_t* index_rec;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err = DB_SUCCESS;
+ ibool moved;
+ ibool cons_read_requires_clust_rec;
+ ibool was_lock_wait;
+ ulint shortcut;
+ ibool unique_search = FALSE;
+ ibool unique_search_from_clust_index = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ ibool set_also_gap_locks = TRUE;
+ /* if the query is a plain
+ locking SELECT, and the isolation
+ level is <= TRX_ISO_READ_COMMITTED,
+ then this is set to FALSE */
+ ibool success;
+ ibool comp;
+ ulint cnt = 0;
+ ulint next_offs;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(index && pcur && search_tuple);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n"
+"InnoDB: table %s does not exist.\n"
+"InnoDB: Have you deleted the .ibd file from the database directory under\n"
+"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n"
+"InnoDB: Look from\n"
+"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n"
+"InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption((byte*)prebuilt);
+
+ ut_error;
+ }
+
+ if (trx->n_mysql_tables_in_use == 0
+ && prebuilt->select_lock_type == LOCK_NONE) {
+ /* Note that if MySQL uses an InnoDB temp table that it
+ created inside LOCK TABLES, then n_mysql_tables_in_use can
+ be zero; in that case select_lock_type is set to LOCK_X in
+ ::start_stmt. */
+
+ fputs(
+"InnoDB: Error: MySQL is trying to perform a SELECT\n"
+"InnoDB: but it has not locked any tables in ::external_lock()!\n",
+ stderr);
+ trx_print(stderr, trx);
+ fputc('\n', stderr);
+ }
+
+/* fprintf(stderr, "Match mode %lu\n search tuple ", (ulong) match_mode);
+ dtuple_print(search_tuple);
+
+ fprintf(stderr, "N tables locked %lu\n", trx->mysql_n_tables_locked);
+*/
+ /*-------------------------------------------------------------*/
+ /* PHASE 0: Release a possible s-latch we are holding on the
+ adaptive hash index latch if there is someone waiting behind */
+
+ if (trx->has_search_latch
+ && btr_search_latch.writer != RW_LOCK_NOT_LOCKED) {
+
+ /* There is an x-latch request on the adaptive hash index:
+ release the s-latch to reduce starvation and wait for
+ BTR_SEA_TIMEOUT rounds before trying to keep it again over
+ calls from MySQL */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
+ if (direction == 0) {
+ trx->op_info = "starting index read";
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ trx->op_info = "fetching rows";
+
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (direction != prebuilt->fetch_direction) {
+ if (prebuilt->n_fetch_cached > 0) {
+ ut_error;
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+
+ srv_n_rows_read++;
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+ /* The previous returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+ /* Prevent wrap-over */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
+ mode = pcur->search_mode;
+ }
+
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete-
+ marked matching record.
+
+ Note that in a unique secondary index there may be different delete-
+ marked versions of a record where only the primary key values differ:
+ thus in a secondary index we must use next-key locks when locking
+ delete-marked records. */
+
+ if (match_mode == ROW_SEL_EXACT
+ && index->type & DICT_UNIQUE
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)
+ && (index->type & DICT_CLUSTERED
+ || !dtuple_contains_null(search_tuple))) {
+
+ /* Note above that a UNIQUE secondary index can contain many
+ rows with the same key value if one of the columns is the SQL
+ null. A clustered index under MySQL can never contain null
+ columns because we demand that all the columns in primary key
+ are non-null. */
+
+ unique_search = TRUE;
+
+ /* Even if the condition is unique, MySQL seems to try to
+ retrieve also a second row if a primary key contains more than
+ 1 column. Return immediately if this is not a HANDLER
+ command. */
+
+ if (direction != 0 && !prebuilt->used_in_HANDLER) {
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+ /* Next test if this is the special case where we can use the fast
+ adaptive hash index to try the search. Since we must release the
+ search system latch when we retrieve an externally stored field, we
+ cannot use the adaptive hash index in a search in the case the row
+ may be long and there may be externally stored fields */
+
+ if (unique_search
+ && index->type & DICT_CLUSTERED
+ && direction == 0
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->used_in_HANDLER
+ && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
+
+ mode = PAGE_CUR_GE;
+
+ unique_search_from_clust_index = TRUE;
+
+ if (trx->mysql_n_tables_locked == 0
+ && prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view) {
+
+ /* This is a SELECT query done as a consistent read,
+ and the read view has already been allocated:
+ let us try a search shortcut through the hash
+ index.
+ NOTE that we must also test that
+ mysql_n_tables_locked == 0, because this might
+ also be INSERT INTO ... SELECT ... or
+ CREATE TABLE ... SELECT ... . Our algorithm is
+ NOT prepared to inserts interleaved with the SELECT,
+ and if we try that, we can deadlock on the adaptive
+ hash index semaphore! */
+
+#ifndef UNIV_SEARCH_DEBUG
+ if (!trx->has_search_latch) {
+ rw_lock_s_lock(&btr_search_latch);
+ trx->has_search_latch = TRUE;
+ }
+#endif
+ shortcut = row_sel_try_search_shortcut_for_mysql(&rec,
+ prebuilt, &offsets, &heap, &mtr);
+ if (shortcut == SEL_FOUND) {
+#ifdef UNIV_SEARCH_DEBUG
+ ut_a(0 == cmp_dtuple_rec(search_tuple,
+ rec, offsets));
+#endif
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ rec, offsets)) {
+ err = DB_TOO_BIG_RECORD;
+
+ /* We let the main loop to do the
+ error handling */
+ goto shortcut_fails_too_big_rec;
+ }
+
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" shortcut\n", stderr); */
+
+ srv_n_rows_read++;
+
+ if (trx->search_latch_timeout > 0
+ && trx->has_search_latch) {
+
+ trx->search_latch_timeout--;
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ /* NOTE that we do NOT store the cursor
+ position */
+ err = DB_SUCCESS;
+ goto func_exit;
+
+ } else if (shortcut == SEL_EXHAUSTED) {
+
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 2\n", stderr); */
+
+ if (trx->search_latch_timeout > 0
+ && trx->has_search_latch) {
+
+ trx->search_latch_timeout--;
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ /* NOTE that we do NOT store the cursor
+ position */
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+shortcut_fails_too_big_rec:
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
+ if (trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ trx_start_if_not_started(trx);
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE
+ && trx->mysql_query_str) {
+
+ /* Scan the MySQL query string; check if SELECT is the first
+ word there */
+
+ dict_accept(*trx->mysql_query_str, "SELECT", &success);
+
+ if (success) {
+ /* It is a plain locking SELECT and the isolation
+ level is low: do not lock gaps */
+
+ set_also_gap_locks = FALSE;
+ }
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (direction == 0) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (direction != 0) {
+ moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ if (!moved) {
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+ btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+
+ pcur->trx_if_known = trx;
+ } else {
+ if (mode == PAGE_CUR_G) {
+ btr_pcur_open_at_index_side(TRUE, index,
+ BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+ } else if (mode == PAGE_CUR_L) {
+ btr_pcur_open_at_index_side(FALSE, index,
+ BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+ }
+ }
+
+ if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+
+ if (trx->read_view == NULL
+ && prebuilt->select_lock_type == LOCK_NONE) {
+
+ fputs(
+"InnoDB: Error: MySQL is trying to perform a consistent read\n"
+"InnoDB: but the read view is not assigned!\n", stderr);
+ trx_print(stderr, trx);
+ fputc('\n', stderr);
+ ut_a(0);
+ }
+ } else if (prebuilt->select_lock_type == LOCK_NONE) {
+ /* This is a consistent read */
+ /* Assign a read view for the query */
+
+ trx_assign_read_view(trx);
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ if (prebuilt->select_lock_type == LOCK_S) {
+ err = lock_table(0, index->table, LOCK_IS, thr);
+ } else {
+ err = lock_table(0, index->table, LOCK_IX, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ prebuilt->sql_stat_start = FALSE;
+ }
+
+rec_loop:
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
+ rec = btr_pcur_get_rec(pcur);
+ comp = index->table->comp;
+ ut_ad(comp == page_is_comp(buf_frame_align(rec)));
+/*
+ fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
+ buf_frame_get_page_no(buf_frame_align(rec)));
+ rec_print(rec);
+*/
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ if (prebuilt->select_lock_type != LOCK_NONE
+ && set_also_gap_locks) {
+
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used,
+ we do not lock gaps. Supremum record is really
+ a gap and therefore we do not set locks there. */
+
+ if (!srv_locks_unsafe_for_binlog) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ }
+ /* A page supremum record cannot be in the result set: skip
+ it now that we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* Do sanity checks in case our cursor has bumped into page
+ corruption */
+
+ next_offs = rec_get_next_offs(rec, comp);
+
+ if (next_offs >= UNIV_PAGE_SIZE
+ || next_offs <
+ (ulint) (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)) {
+
+ if (srv_force_recovery == 0 || moves_up == FALSE) {
+ ut_print_timestamp(stderr);
+ buf_page_print(buf_frame_align(rec));
+ fprintf(stderr,
+"\nInnoDB: rec address %p, first buffer frame %p\n"
+"InnoDB: buffer pool high end %p, buf block fix count %lu\n",
+ rec, buf_pool->frame_zero,
+ buf_pool->high_end,
+ (ulong)buf_block_align(rec)->buf_fix_count);
+ fprintf(stderr,
+"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
+"InnoDB: ",
+ (ulong) (rec - buf_frame_align(rec)),
+ (ulong) next_offs,
+ (ulong) buf_frame_get_page_no(rec));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". Run CHECK TABLE. You may need to\n"
+"InnoDB: restore from a backup, or dump + drop + reimport the table.\n",
+ stderr);
+
+ err = DB_CORRUPTION;
+
+ goto lock_wait_or_error;
+ } else {
+ /* The user may be dumping a corrupt table. Jump
+ over the corruption to recover as much as possible. */
+
+ fprintf(stderr,
+"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
+"InnoDB: ",
+ (ulong) (rec - buf_frame_align(rec)),
+ (ulong) next_offs,
+ (ulong) buf_frame_get_page_no(rec));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the rest of the page.\n",
+ stderr);
+
+ btr_pcur_move_to_last_on_page(pcur, &mtr);
+
+ goto next_rec;
+ }
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (srv_force_recovery > 0) {
+ if (!rec_validate(rec, offsets)
+ || !btr_index_rec_validate(rec, index, FALSE)) {
+ fprintf(stderr,
+"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
+"InnoDB: ",
+ (ulong) (rec - buf_frame_align(rec)),
+ (ulong) next_offs,
+ (ulong) buf_frame_get_page_no(rec));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the record.\n",
+ stderr);
+
+ goto next_rec;
+ }
+ }
+
+ /*-------------------------------------------------------------*/
+
+ /* Note that we cannot trust the up_match value in the cursor at this
+ place because we can arrive here after moving the cursor! Thus
+ we have to recompare rec and search_tuple to determine if they
+ match enough. */
+
+ if (match_mode == ROW_SEL_EXACT) {
+ /* Test if the index record matches completely to search_tuple
+ in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+ /* fputs("Comparing rec and search tuple\n", stderr); */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+ if (prebuilt->select_lock_type != LOCK_NONE
+ && set_also_gap_locks) {
+
+ /* Try to place a gap lock on the index
+ record only if innodb_locks_unsafe_for_binlog
+ option is not set */
+
+ if (srv_locks_unsafe_for_binlog == FALSE) {
+
+ err = sel_set_rec_lock(rec, index,
+ offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ err = DB_RECORD_NOT_FOUND;
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 3\n", stderr); */
+
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+ if (prebuilt->select_lock_type != LOCK_NONE
+ && set_also_gap_locks) {
+
+ /* Try to place a gap lock on the index
+ record only if innodb_locks_unsafe_for_binlog
+ option is not set */
+
+ if (srv_locks_unsafe_for_binlog == FALSE) {
+
+ err = sel_set_rec_lock(rec, index,
+ offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ err = DB_RECORD_NOT_FOUND;
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 4\n", stderr); */
+
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ cons_read_requires_clust_rec = FALSE;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+ ulint lock_type;
+
+ if (!set_also_gap_locks
+ || (unique_search && !rec_get_deleted_flag(rec, comp))) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ /* If innodb_locks_unsafe_for_binlog option is used,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+
+ if (srv_locks_unsafe_for_binlog) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+ }
+
+ /* If we are doing a 'greater or equal than a primary key
+ value' search from a clustered index, and we find a record
+ that has that exact primary key value, then there is no need
+ to lock the gap before the record, because no insert in the
+ gap can be in our search range. That is, no phantom row can
+ appear that way.
+
+ An example: if col1 is the primary key, the search is WHERE
+ col1 >= 100, and we find a record where col1 = 100, then no
+ need to lock the gap before that record. */
+
+ if (index == clust_index
+ && mode == PAGE_CUR_GE
+ && direction == 0
+ && dtuple_get_n_fields_cmp(search_tuple)
+ == dict_index_get_n_unique(index)
+ && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+ lock_type = LOCK_REC_NOT_GAP;
+ }
+
+ err = sel_set_rec_lock(rec, index, offsets,
+ prebuilt->select_lock_type,
+ lock_type, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
+
+ /* Fetch a previous version of the row if the current
+ one is not visible in the snapshot; if we have a very
+ high force recovery level set, we try to avoid crashes
+ by skipping this lookup */
+
+ if (srv_force_recovery < 5
+ && !lock_clust_rec_cons_read_sees(rec, index,
+ offsets, trx->read_view)) {
+
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, rec,
+ &offsets, &heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index,
+ trx->read_view)) {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec, comp)
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete-marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ goto next_rec;
+ }
+
+ /* Get the clustered index record if needed and if we did
+ not do the search using the clustered index */
+
+ index_rec = rec;
+
+ /* Before and after the following "if" block, "offsets" will be
+ related to "rec", which may be in "index", a secondary index or
+ the clustered index ("clust_index"). However, after this "if" block,
+ "rec" may be pointing to "clust_rec" of "clust_index". */
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (index != clust_index && (cons_read_requires_clust_rec
+ || prebuilt->need_to_access_clustered)) {
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec,
+ &offsets, &heap, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, comp)) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (prebuilt->need_to_access_clustered) {
+ rec = clust_rec;
+ ut_ad(rec_offs_validate(rec, clust_index, offsets));
+ } else {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ }
+ }
+
+ /* We found a qualifying row */
+ ut_ad(rec_offs_validate(rec,
+ rec == clust_rec ? clust_index : index,
+ offsets));
+
+ if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->clust_index_was_generated
+ && !prebuilt->used_in_HANDLER
+ && prebuilt->template_type
+ != ROW_MYSQL_DUMMY_TEMPLATE) {
+
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+ update, that is why we require ...lock_type == LOCK_NONE.
+ Since we keep space in prebuilt only for the BLOBs of
+ a single row, we cannot cache rows in the case there
+ are BLOBs in the fields to be fetched. In HANDLER we do
+ not cache rows because there the cursor is a scrollable
+ cursor. */
+
+ row_sel_push_cache_row_for_mysql(prebuilt, rec, offsets);
+
+ if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
+
+ goto got_row;
+ }
+
+ goto next_rec;
+ } else {
+ if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
+ memcpy(buf + 4, rec - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets));
+ mach_write_to_4(buf,
+ rec_offs_extra_size(offsets) + 4);
+ } else {
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ rec, offsets)) {
+ err = DB_TOO_BIG_RECORD;
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (prebuilt->clust_index_was_generated) {
+ if (rec != index_rec) {
+ offsets = rec_get_offsets(
+ index_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ }
+ row_sel_store_row_id_to_prebuilt(prebuilt, index_rec,
+ index, offsets);
+ }
+ }
+got_row:
+ /* We have an optimization to save CPU time: if this is a consistent
+ read on a unique condition on the clustered index, then we do not
+ store the pcur position, because any fetch next or prev will anyway
+ return 'end of file'. An exception is the MySQL HANDLER command
+ where the user can move the cursor with PREV or NEXT even after
+ a unique search. */
+
+ if (!unique_search_from_clust_index
+ || prebuilt->select_lock_type == LOCK_X
+ || prebuilt->used_in_HANDLER) {
+
+ /* Inside an update always store the cursor position */
+
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
+ err = DB_SUCCESS;
+
+ goto normal_return;
+
+next_rec:
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
+ if (mtr_has_extra_clust_latch) {
+ /* We must commit mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we would access a different clustered
+ index page right away without releasing the previous. */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr_start(&mtr);
+ moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ if (moved) {
+ cnt++;
+
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ moved = btr_pcur_move_to_next(pcur, &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(pcur, &mtr);
+ }
+
+ if (!moved) {
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (match_mode != 0) {
+ err = DB_RECORD_NOT_FOUND;
+ } else {
+ err = DB_END_OF_INDEX;
+ }
+
+ goto normal_return;
+ }
+
+ cnt++;
+
+ goto rec_loop;
+
+lock_wait_or_error:
+ /*-------------------------------------------------------------*/
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ que_thr_stop_for_mysql(thr);
+
+ thr->lock_state= QUE_THR_LOCK_ROW;
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ mtr_start(&mtr);
+
+ sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+/* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+ goto func_exit;
+
+normal_return:
+ /*-------------------------------------------------------------*/
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ mtr_commit(&mtr);
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ err = DB_SUCCESS;
+ }
+
+/* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+ if (err == DB_SUCCESS) {
+ srv_n_rows_read++;
+ }
+
+func_exit:
+ trx->op_info = "";
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***********************************************************************
+Checks if MySQL at the moment is allowed for this table to retrieve a
+consistent read result, or store it to the query cache. */
+
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ /* out: TRUE if storing or retrieving
+ from the query cache is permitted */
+ trx_t* trx, /* in: transaction object */
+ const char* norm_name) /* in: concatenation of database name,
+ '/' char, table name */
+{
+ dict_table_t* table;
+ ibool ret = FALSE;
+
+ table = dict_table_get(norm_name, trx);
+
+ if (table == NULL) {
+
+ return(FALSE);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ /* Start the transaction if it is not started yet */
+
+ trx_start_if_not_started_low(trx);
+
+ /* If there are locks on the table or some trx has invalidated the
+ cache up to our trx id, then ret = FALSE.
+ We do not check what type locks there are on the table, though only
+ IX type locks actually would require ret = FALSE. */
+
+ if (UT_LIST_GET_LEN(table->locks) == 0
+ && ut_dulint_cmp(trx->id, table->query_cache_inv_trx_id) >= 0) {
+
+ ret = TRUE;
+
+ /* If the isolation level is high, assign a read view for the
+ transaction if it does not yet have one */
+
+ if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+ && !trx->read_view) {
+
+ trx->read_view = read_view_open_now(trx,
+ trx->read_view_heap);
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(ret);
+}
diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.c
new file mode 100644
index 00000000000..9dc860d70b1
--- /dev/null
+++ b/storage/innobase/row/row0uins.c
@@ -0,0 +1,308 @@
+/******************************************************
+Fresh insert undo
+
+(c) 1996 Innobase Oy
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*******************************************************************
+Removes a clustered index record. The pcur in node was positioned on the
+record, now it is detached. */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node) /* in: undo node */
+{
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+ &mtr);
+ ut_a(success);
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ /* Drop the index tree associated with the row in
+ SYS_INDEXES table: */
+
+ dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &(node->pcur), &mtr);
+ ut_a(success);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (success) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+retry:
+ /* If did not succeed, try pessimistic descent to tree */
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+ &(node->pcur), &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(err);
+}
+
+/*******************************************************************
+Removes a secondary index entry if found. */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+ /* out: DB_SUCCESS, DB_FAIL, or
+ DB_OUT_OF_FILE_SPACE */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry to remove */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool found;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/*******************************************************************
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree. */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry to insert */
+{
+ ulint err;
+ ulint n_tries = 0;
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+ undo_node_t* node) /* in: row undo node */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ ulint type;
+ ulint dummy;
+ ibool dummy_extern;
+
+ ut_ad(node);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
+ ut_ad(type == TRX_UNDO_INSERT_REC);
+ node->rec_type = type;
+
+ node->table = dict_table_get_on_id(table_id, node->trx);
+
+ if (node->table == NULL) {
+
+ return;
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+}
+
+/***************************************************************
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. */
+
+ulint
+row_undo_ins(
+/*=========*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node) /* in: row undo node */
+{
+ dtuple_t* entry;
+ ibool found;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(node->state == UNDO_NODE_INSERT);
+
+ row_undo_ins_parse_undo_rec(node);
+
+ if (node->table == NULL) {
+ found = FALSE;
+ } else {
+ found = row_undo_search_clust_to_pcur(node);
+ }
+
+ if (!found) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ while (node->index != NULL) {
+ entry = row_build_index_entry(node->row, node->index,
+ node->heap);
+ err = row_undo_ins_remove_sec(node->index, entry);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ err = row_undo_ins_remove_clust_rec(node);
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c
new file mode 100644
index 00000000000..1cade0f304f
--- /dev/null
+++ b/storage/innobase/row/row0umod.c
@@ -0,0 +1,773 @@
+/******************************************************
+Undo modify of a row
+
+(c) 1997 Innobase Oy
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may have delete mark already FALSE, if the delete mark operation was
+stopped underway, or if the undo operation ended prematurely because of a
+system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/***************************************************************
+Checks if also the previous version of the clustered index record was
+modified or inserted by the same transaction, and its undo number is such
+that it should be undone in the same rollback. */
+UNIV_INLINE
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+ /* out: TRUE if also previous modify or
+ insert of this row should be undone */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dulint* undo_no)/* out: the undo number */
+{
+ trx_undo_rec_t* undo_rec;
+ ibool ret;
+ trx_t* trx;
+
+ UT_NOT_USED(thr);
+
+ trx = node->trx;
+
+ if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) {
+
+ return(FALSE);
+ }
+
+ undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap);
+
+ *undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ if (ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0) {
+ ret = TRUE;
+ } else {
+ ret = FALSE;
+ }
+
+ return(ret);
+}
+
+/***************************************************************
+Undoes a modify in a clustered index record. */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code:
+ we may run out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ big_rec_t* dummy_big_rec;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+
+ if (mode == BTR_MODIFY_LEAF) {
+
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, &dummy_big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Removes a clustered index record after undo if possible. */
+static
+ulint
+row_undo_mod_remove_clust_low(
+/*==========================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code:
+ we may run out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr __attribute__((unused)), /* in: query thread */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ if (!success) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* Find out if we can remove the whole clustered index record */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+ /* Ok, we can remove */
+ } else {
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* Note that since this operation is analogous to purge,
+ we can free also inherited externally stored fields:
+ hence the last FALSE in the call below */
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, FALSE, mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a modify in a clustered index record. Sets also the node state for the
+next round of undo. */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+ /* out: DB_SUCCESS or error code: we may run
+ out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ ulint err;
+ ibool success;
+ ibool more_vers;
+ dulint new_undo_no;
+
+ ut_ad(node && thr);
+
+ /* Check if also the previous version of the clustered index record
+ should be undone in this same rollback operation */
+
+ more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no);
+
+ pcur = &(node->pcur);
+
+ mtr_start(&mtr);
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_LEAF);
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a
+ pessimistic descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ if (more_vers && err == DB_SUCCESS) {
+
+ /* Reserve the undo log record to the prior version after
+ committing &mtr: this is necessary to comply with the latching
+ order, as &mtr may contain the fsp latch which is lower in
+ the latch hierarchy than trx->undo_mutex. */
+
+ success = trx_undo_rec_reserve(node->trx, new_undo_no);
+
+ if (success) {
+ node->state = UNDO_NODE_PREV_VERS;
+ }
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found. */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ /* out: DB_SUCCESS, DB_FAIL, or
+ DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ ibool found;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ /* We should remove the index record if no prior version of the row,
+ which cannot be purged yet, requires its existence. If some requires,
+ we should delete mark the record. */
+
+ mtr_start(&mtr_vers);
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_a(success);
+
+ old_has = row_vers_old_has_index_entry(FALSE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ if (old_has) {
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ TRUE, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.c, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked. */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ ulint err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE);
+ return(err);
+}
+
+/***************************************************************
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does not harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. */
+static
+ulint
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+ /* out: DB_FAIL or DB_SUCCESS or
+ DB_OUT_OF_FILE_SPACE */
+ ulint mode, /* in: search mode: BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ upd_t* update;
+ ulint err = DB_SUCCESS;
+ ibool found;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ if (!found) {
+ fputs("InnoDB: error in sec index entry del undo in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+ putc('\n', stderr);
+ trx_print(stderr, trx);
+ fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+ } else {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, FALSE, thr, &mtr);
+ ut_a(err == DB_SUCCESS);
+ heap = mem_heap_create(100);
+
+ update = row_upd_build_sec_rec_difference_binary(index, entry,
+ btr_cur_get_rec(btr_cur), trx, heap);
+ if (upd_get_n_fields(update) == 0) {
+
+ /* Do nothing */
+
+ } else if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping
+ changes within the page */
+
+ err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG
+ | BTR_NO_LOCKING_FLAG,
+ btr_cur, update, 0, thr, &mtr);
+ if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG
+ | BTR_NO_LOCKING_FLAG,
+ btr_cur, &dummy_big_rec,
+ update, 0, thr, &mtr);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is UPD_DEL. */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr, index,
+ entry);
+ if (err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is DEL_MARK. */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF,
+ thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE,
+ thr, index, entry);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST. */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ /* No change in secondary indexes */
+
+ return(DB_SUCCESS);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field_binary(node->row, node->index,
+ node->update)) {
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ /* NOTE that if we updated the fields of a
+ delete-marked secondary index record so that
+ alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc', we cannot return to the original
+ values because we do not know them. But this should
+ not cause problems because in row0sel.c, in queries
+ we always retrieve the clustered index record or an
+ earlier version of it, if the secondary index record
+ through which we do the search is delete-marked. */
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr,
+ index, entry);
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row. We also need to update the fields of
+ the secondary index record if we updated its fields
+ but alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc'. */
+
+ row_upd_index_replace_new_col_vals(entry, index,
+ node->update, NULL);
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF,
+ thr, index, entry);
+ if (err == DB_FAIL) {
+ err =
+ row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE,
+ thr, index, entry);
+ }
+
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+ ibool dummy_extern;
+ trx_t* trx;
+
+ ut_ad(node && thr);
+ trx = thr_get_trx(thr);
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ node->table = dict_table_get_on_id(table_id, trx);
+
+ /* TODO: other fixes associated with DROP TABLE + rollback in the
+ same table by another user */
+
+ if (node->table == NULL) {
+ /* Table was dropped */
+ return;
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+ node->new_roll_ptr = roll_ptr;
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+}
+
+/***************************************************************
+Undoes a modify operation on a row of a table. */
+
+ulint
+row_undo_mod(
+/*=========*/
+ /* out: DB_SUCCESS or error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool found;
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+
+ row_undo_mod_parse_undo_rec(node, thr);
+
+ if (node->table == NULL) {
+ found = FALSE;
+ } else {
+ found = row_undo_search_clust_to_pcur(node);
+ }
+
+ if (!found) {
+ /* It is already undone, or will be undone by another query
+ thread, or table was dropped */
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ err = row_undo_mod_upd_exist_sec(node, thr);
+
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+ err = row_undo_mod_del_mark_sec(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ err = row_undo_mod_upd_del_sec(node, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_clust(node, thr);
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0undo.c b/storage/innobase/row/row0undo.c
new file mode 100644
index 00000000000..abe73cbe705
--- /dev/null
+++ b/storage/innobase/row/row0undo.c
@@ -0,0 +1,350 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row.
+Then the row id changes and also roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that row id is in ascending order.
+So, lets store row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it does not update the secondary index and not the clustered index
+ord field. Then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
+
+/************************************************************************
+Creates a row undo node to a query graph. */
+
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ /* out, own: undo node */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx && parent && heap);
+
+ undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ btr_pcur_init(&(undo->pcur));
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***************************************************************
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case. */
+
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ /* out: TRUE if found; NOTE the node->pcur
+ must be closed by the caller, regardless of
+ the return value */
+ undo_node_t* node) /* in: row undo node */
+{
+ dict_index_t* clust_index;
+ ibool found;
+ mtr_t mtr;
+ ibool ret;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ rec = btr_pcur_get_rec(&(node->pcur));
+
+ offsets = rec_get_offsets(rec, clust_index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(rec, clust_index, offsets))) {
+
+ /* We must remove the reservation on the undo log record
+ BEFORE releasing the latch on the clustered index page: this
+ is to make sure that some thread will eventually undo the
+ modification corresponding to node->roll_ptr. */
+
+ /* fputs("--------------------undoing a previous version\n",
+ stderr); */
+
+ ret = FALSE;
+ } else {
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ offsets, node->heap);
+ btr_pcur_store_position(&(node->pcur), &mtr);
+
+ ret = TRUE;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/***************************************************************
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_undo(
+/*=====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ trx_t* trx;
+ dulint roll_ptr;
+ ibool froze_data_dict = FALSE;
+
+ ut_ad(node && thr);
+
+ trx = node->trx;
+
+ if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+ trx->roll_limit,
+ &roll_ptr,
+ node->heap);
+ if (!node->undo_rec) {
+ /* Rollback completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+
+ } else if (node->state == UNDO_NODE_PREV_VERS) {
+
+ /* Undo should be done to the same clustered index record
+ again in this same rollback, restoring the previous version */
+
+ roll_ptr = node->new_roll_ptr;
+
+ node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+ node->heap);
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+ }
+
+ /* Prevent DROP TABLE etc. while we are rolling back this row.
+ If we are doing a TABLE CREATE or some other dictionary operation,
+ then we already have dict_operation_lock locked in x-mode. Do not
+ try to lock again in s-mode, because that would cause a hang. */
+
+ if (trx->dict_operation_lock_mode == 0) {
+
+ row_mysql_freeze_data_dictionary(trx);
+
+ froze_data_dict = TRUE;
+ }
+
+ if (node->state == UNDO_NODE_INSERT) {
+
+ err = row_undo_ins(node);
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ } else {
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+ err = row_undo_mod(node, thr);
+ }
+
+ if (froze_data_dict) {
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs. */
+
+que_thr_t*
+row_undo_step(
+/*==========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ undo_node_t* node;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ srv_activity_count++;
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ err = row_undo(node, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+
+ fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
+ (ulong) err);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ fprintf(stderr,
+ "InnoDB: Error 13 means out of tablespace.\n"
+ "InnoDB: Consider increasing your tablespace.\n");
+
+ exit(1);
+ }
+
+ ut_error;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c
new file mode 100644
index 00000000000..3305724a89b
--- /dev/null
+++ b/storage/innobase/row/row0upd.c
@@ -0,0 +1,2035 @@
+/******************************************************
+Update of a row
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged, the index records will be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/***************************************************************
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes. */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ /* out: TRUE if changes */
+ dtuple_t* entry, /* in: old value of index entry */
+ dict_index_t* index, /* in: index of entry */
+ upd_t* update, /* in: update vector for the row */
+ ulint n); /* in: how many first fields to check */
+
+
+/*************************************************************************
+Checks if index currently is mentioned as a referenced index in a foreign
+key constraint. */
+static
+ibool
+row_upd_index_is_referenced(
+/*========================*/
+ /* out: TRUE if referenced; NOTE that since
+ we do not hold dict_operation_lock
+ when leaving the function, it may be that
+ the referencing table has been dropped when
+ we leave this function: this function is only
+ for heuristic use! */
+ dict_index_t* index, /* in: index */
+ trx_t* trx) /* in: transaction */
+{
+ dict_table_t* table = index->table;
+ dict_foreign_t* foreign;
+ ibool froze_data_dict = FALSE;
+
+ if (!UT_LIST_GET_FIRST(table->referenced_list)) {
+
+ return(FALSE);
+ }
+
+ if (trx->dict_operation_lock_mode == 0) {
+ row_mysql_freeze_data_dictionary(trx);
+ froze_data_dict = TRUE;
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ if (foreign->referenced_index == index) {
+
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(TRUE);
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur. NOTE that this function will temporarily commit mtr and lose the
+pcur position! */
+static
+ulint
+row_upd_check_references_constraints(
+/*=================================*/
+ /* out: DB_SUCCESS or an error code */
+ upd_node_t* node, /* in: row update node */
+ btr_pcur_t* pcur, /* in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /* in: table in question */
+ dict_index_t* index, /* in: index of the cursor */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ rec_t* rec;
+ ulint err;
+ ibool got_s_lock = FALSE;
+
+ if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->referenced_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(entry, index,
+ node->update, foreign->n_fields))) {
+
+ if (foreign->foreign_table == NULL) {
+ dict_table_get(foreign->foreign_table_name,
+ trx);
+ }
+
+ if (foreign->foreign_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ (foreign->foreign_table
+ ->n_foreign_key_checks_running)++;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects 'foreign' from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(FALSE, foreign,
+ table, entry, thr);
+
+ if (foreign->foreign_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ ut_a(foreign->foreign_table
+ ->n_foreign_key_checks_running > 0);
+
+ (foreign->foreign_table
+ ->n_foreign_key_checks_running)--;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (err != DB_SUCCESS) {
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(
+ trx);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************************
+Creates an update node for a query graph. */
+
+upd_node_t*
+upd_node_create(
+/*============*/
+ /* out, own: update node */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(upd_node_t));
+ node->common.type = QUE_NODE_UPDATE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->select_will_do_update = FALSE;
+ node->in_mysql_interface = FALSE;
+
+ node->row = NULL;
+ node->ext_vec = NULL;
+ node->index = NULL;
+ node->update = NULL;
+
+ node->foreign = NULL;
+ node->cascade_heap = NULL;
+ node->cascade_node = NULL;
+
+ node->select = NULL;
+
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ node->cmpl_info = 0;
+
+ return(node);
+}
+
+/*************************************************************************
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /* in: record */
+ const ulint* offsets,/* in: array returned by rec_get_offsets() */
+ ulint pos, /* in: TRX_ID position in rec */
+ dulint trx_id, /* in: transaction id */
+ dulint roll_ptr)/* in: roll ptr of the undo log record */
+{
+ byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(field, trx_id);
+
+ field = rec_get_nth_field(rec, offsets, pos + 1, &len);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(field, roll_ptr);
+}
+
+/*************************************************************************
+Sets the trx id or roll ptr field of a clustered index entry. */
+
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ dtuple_t* entry, /* in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /* in: clustered index */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val) /* in: value to write */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint pos;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ dfield = dtuple_get_nth_field(entry, pos);
+ field = dfield_get_data(dfield);
+
+ if (type == DATA_TRX_ID) {
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/***************************************************************
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update. */
+
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ /* out: TRUE if the update changes the size of
+ some field in index or the field is external
+ in rec or update */
+ dict_index_t* index, /* in: index */
+ const ulint* offsets,/* in: rec_get_offsets(rec, index) */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+ new_len = new_val->len;
+
+ if (new_len == UNIV_SQL_NULL && !rec_offs_comp(offsets)) {
+ /* A bug fixed on Dec 31st, 2004: we looked at the
+ SQL NULL size from the wrong field! We may backport
+ this fix also to 4.0. The merge to 5.0 will be made
+ manually immediately after we commit this to 4.1. */
+
+ new_len = dtype_get_sql_null_size(
+ dict_index_get_nth_type(index,
+ upd_field->field_no));
+ }
+
+ old_len = rec_offs_nth_size(offsets, upd_field->field_no);
+
+ if (old_len != new_len) {
+
+ return(TRUE);
+ }
+
+ if (rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+
+ if (upd_field->extern_storage) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************
+Replaces the new column values stored in the update vector to the record
+given. No field size changes are allowed. This function is used only for
+a clustered index */
+
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /* in/out: record where replaced */
+ const ulint* offsets,/* in: array returned by rec_get_offsets() */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits);
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ rec_set_nth_field(rec, offsets, upd_field->field_no,
+ dfield_get_data(new_val),
+ dfield_get_len(new_val));
+ }
+}
+
+/*************************************************************************
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record. */
+
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ /* out: new pointer to mlog */
+ dict_index_t* index, /* in: clustered index */
+ trx_t* trx, /* in: transaction */
+ dulint roll_ptr,/* in: roll ptr of the undo log record */
+ byte* log_ptr,/* pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr __attribute__((unused))) /* in: mtr */
+{
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(mtr);
+
+ log_ptr += mach_write_compressed(log_ptr,
+ dict_index_get_sys_col_pos(index, DATA_TRX_ID));
+
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+ return(log_ptr);
+}
+
+/*************************************************************************
+Parses the log data of system field values. */
+
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ ulint* pos, /* out: TRX_ID position in record */
+ dulint* trx_id, /* out: trx id */
+ dulint* roll_ptr)/* out: roll ptr */
+{
+ ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+ return(NULL);
+ }
+
+ *roll_ptr = trx_read_roll_ptr(ptr);
+ ptr += DATA_ROLL_PTR_LEN;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id);
+
+ return(ptr);
+}
+
+/***************************************************************
+Writes to the redo log the new values of the fields occurring in the index. */
+
+void
+row_upd_index_write_log(
+/*====================*/
+ upd_t* update, /* in: update vector */
+ byte* log_ptr,/* in: pointer to mlog buffer: must contain at least
+ MLOG_BUF_MARGIN bytes of free space; the buffer is
+ closed within this function */
+ mtr_t* mtr) /* in: mtr into whose log to write */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf_end;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+ mach_write_to_1(log_ptr, update->info_bits);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+
+ ut_ad(MLOG_BUF_MARGIN > 30);
+
+ if (log_ptr + 30 > buf_end) {
+ mlog_close(mtr, log_ptr);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+
+ len = new_val->len;
+
+ log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+ log_ptr += mach_write_compressed(log_ptr, len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (log_ptr + len < buf_end) {
+ ut_memcpy(log_ptr, new_val->data, len);
+
+ log_ptr += len;
+ } else {
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, new_val->data, len);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+ }
+ }
+
+ mlog_close(mtr, log_ptr);
+}
+
+/*************************************************************************
+Parses the log data written by row_upd_index_write_log. */
+
+byte*
+row_upd_index_parse(
+/*================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ mem_heap_t* heap, /* in: memory heap where update vector is
+ built */
+ upd_t** update_out)/* out: update vector */
+{
+ upd_t* update;
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf;
+ ulint info_bits;
+ ulint i;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_bits = mach_read_from_1(ptr);
+ ptr++;
+ ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ update = upd_create(n_fields, heap);
+ update->info_bits = info_bits;
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ ptr = mach_parse_compressed(ptr, end_ptr,
+ &(upd_field->field_no));
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ new_val->len = len;
+
+ if (len != UNIV_SQL_NULL) {
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ } else {
+ buf = mem_heap_alloc(heap, len);
+ ut_memcpy(buf, ptr, len);
+
+ ptr += len;
+
+ new_val->data = buf;
+ }
+ }
+ }
+
+ *update_out = update;
+
+ return(ptr);
+}
+
+/*******************************************************************
+Returns TRUE if ext_vec contains i. */
+static
+ibool
+upd_ext_vec_contains(
+/*=================*/
+ /* out: TRUE if i is in ext_vec */
+ ulint* ext_vec, /* in: array of indexes or NULL */
+ ulint n_ext_vec, /* in: number of numbers in ext_vec */
+ ulint i) /* in: a number */
+{
+ ulint j;
+
+ if (ext_vec == NULL) {
+
+ return(FALSE);
+ }
+
+ for (j = 0; j < n_ext_vec; j++) {
+ if (ext_vec[j] == i) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings! */
+
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ /* out, own: update vector of differing
+ fields */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: entry to insert */
+ rec_t* rec, /* in: secondary index record */
+ trx_t* trx, /* in: transaction */
+ mem_heap_t* heap) /* in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint i;
+ ulint offsets_[REC_OFFS_SMALL_SIZE];
+ const ulint* offsets;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ /* This function is used only for a secondary index */
+ ut_a(0 == (index->type & DICT_CLUSTERED));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE that it may be that len != dfield_get_len(dfield) if we
+ are updating in a character set and collation where strings of
+ different length can be equal in an alphabetical comparison,
+ and also in the case where we have a column prefix index
+ and the last characters in the index field are spaces; the
+ latter case probably caused the assertion failures reported at
+ row0upd.c line 713 in versions 4.0.14 - 4.0.16. */
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ upd_field->extern_storage = FALSE;
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/*******************************************************************
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings! */
+
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ /* out, own: update vector of differing
+ fields, excluding roll ptr and trx id */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t* entry, /* in: entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
+ rec_t* rec, /* in: clustered index record */
+ trx_t* trx, /* in: transaction */
+ mem_heap_t* heap) /* in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint roll_ptr_pos;
+ ulint trx_id_pos;
+ ibool extern_bit;
+ ulint i;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ /* This function is used only for a clustered index */
+ ut_a(index->type & DICT_CLUSTERED);
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+ trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (i == trx_id_pos || i == roll_ptr_pos) {
+
+ goto skip_compare;
+ }
+
+ extern_bit = rec_offs_nth_extern(offsets, i);
+
+ if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i)
+ || !dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) {
+ upd_field->extern_storage = TRUE;
+ } else {
+ upd_field->extern_storage = FALSE;
+ }
+
+ n_diff++;
+ }
+skip_compare:
+ ;
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***************************************************************
+Replaces the new column values stored in the update vector to the index entry
+given. */
+
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ dict_index_t* index, /* in: index; NOTE that this may also be a
+ non-clustered index */
+ upd_t* update, /* in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ mem_heap_t* heap) /* in: memory heap to which we allocate and
+ copy the new values, set this as NULL if you
+ do not want allocation */
+{
+ dict_field_t* field;
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ dfield_t* new_val;
+ ulint j;
+ ulint i;
+ dtype_t* cur_type;
+
+ ut_ad(index);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (j = 0; j < dict_index_get_n_fields(index); j++) {
+
+ field = dict_index_get_nth_field(index, j);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no == j) {
+
+ dfield = dtuple_get_nth_field(entry, j);
+
+ new_val = &(upd_field->new_val);
+
+ dfield_set_data(dfield, new_val->data,
+ new_val->len);
+ if (heap && new_val->len != UNIV_SQL_NULL) {
+ dfield->data = mem_heap_alloc(heap,
+ new_val->len);
+ ut_memcpy(dfield->data, new_val->data,
+ new_val->len);
+ }
+
+ if (field->prefix_len > 0
+ && new_val->len != UNIV_SQL_NULL) {
+
+ cur_type = dict_col_get_type(
+ dict_field_get_col(field));
+
+ dfield->len =
+ dtype_get_at_most_n_mbchars(
+ cur_type,
+ field->prefix_len,
+ new_val->len,
+ new_val->data);
+ }
+ }
+ }
+ }
+}
+
+/***************************************************************
+Replaces the new column values stored in the update vector to the index entry
+given. */
+
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ dict_index_t* index, /* in: index; NOTE that this may also be a
+ non-clustered index */
+ upd_t* update, /* in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /* in: memory heap to which we allocate and
+ copy the new values, set this as NULL if you
+ do not want allocation */
+{
+ dict_field_t* field;
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ dfield_t* new_val;
+ ulint j;
+ ulint i;
+ dtype_t* cur_type;
+
+ ut_ad(index);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (j = 0; j < dict_index_get_n_fields(index); j++) {
+
+ field = dict_index_get_nth_field(index, j);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no == field->col->clust_pos) {
+
+ dfield = dtuple_get_nth_field(entry, j);
+
+ new_val = &(upd_field->new_val);
+
+ dfield_set_data(dfield, new_val->data,
+ new_val->len);
+ if (heap && new_val->len != UNIV_SQL_NULL) {
+ dfield->data = mem_heap_alloc(heap,
+ new_val->len);
+ ut_memcpy(dfield->data, new_val->data,
+ new_val->len);
+ }
+
+ if (field->prefix_len > 0
+ && new_val->len != UNIV_SQL_NULL) {
+
+ cur_type = dict_col_get_type(
+ dict_field_get_col(field));
+
+ dfield->len =
+ dtype_get_at_most_n_mbchars(
+ cur_type,
+ field->prefix_len,
+ new_val->len,
+ new_val->data);
+ }
+ }
+ }
+ }
+}
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings! */
+
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+ /* out: TRUE if update vector changes
+ an ordering field in the index record;
+ NOTE: the fields are compared as binary
+ strings */
+ dtuple_t* row, /* in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /* in: index of the record */
+ upd_t* update) /* in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+{
+ upd_field_t* upd_field;
+ dict_field_t* ind_field;
+ dict_col_t* col;
+ ulint n_unique;
+ ulint n_upd_fields;
+ ulint col_pos;
+ ulint col_no;
+ ulint i, j;
+
+ ut_ad(update && index);
+
+ n_unique = dict_index_get_n_unique(index);
+ n_upd_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_unique; i++) {
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col);
+ col_no = dict_col_get_no(col);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field = upd_get_nth_field(update, j);
+
+ /* Note that if the index field is a column prefix
+ then it may be that row does not contain an externally
+ stored part of the column value, and we cannot compare
+ the datas */
+
+ if (col_pos == upd_field->field_no
+ && (row == NULL
+ || ind_field->prefix_len > 0
+ || !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(row, col_no),
+ &(upd_field->new_val)))) {
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+NOTE: we compare the fields as binary strings! */
+
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ /* out: TRUE if update vector may change
+ an ordering field in an index record */
+ dict_table_t* table, /* in: table */
+ upd_t* update) /* in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_index_t* index;
+ ulint i;
+
+ index = dict_table_get_first_index(table);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (dict_field_get_col(dict_index_get_nth_field(index,
+ upd_field->field_no))
+ ->ord_part) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes. */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ /* out: TRUE if changes */
+ dtuple_t* entry, /* in: index entry */
+ dict_index_t* index, /* in: index of entry */
+ upd_t* update, /* in: update vector for the row */
+ ulint n) /* in: how many first fields to check */
+{
+ upd_field_t* upd_field;
+ dict_field_t* ind_field;
+ dict_col_t* col;
+ ulint n_upd_fields;
+ ulint col_pos;
+ ulint i, j;
+
+ ut_a(update && index);
+ ut_a(n <= dict_index_get_n_fields(index));
+
+ n_upd_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n; i++) {
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col);
+
+ ut_a(ind_field->prefix_len == 0);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(entry, i),
+ &(upd_field->new_val))) {
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /* in: record in a clustered index */
+ const ulint* offsets,/* in: array returned by rec_get_offsets() */
+ sym_node_t* column) /* in: first column in a column list, or
+ NULL */
+{
+ byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_field(rec, offsets,
+ column->field_nos[SYM_CLUST_FIELD_NO],
+ &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /* in: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/***************************************************************
+Stores to the heap the row on which the node->pcur is positioned. */
+static
+void
+row_upd_store_row(
+/*==============*/
+ upd_node_t* node) /* in: row update node */
+{
+ dict_index_t* clust_index;
+ upd_t* update;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ node->row = NULL;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ rec = btr_pcur_get_rec(node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ node->heap);
+ node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint)
+ * rec_offs_n_fields(offsets));
+ if (node->is_delete) {
+ update = NULL;
+ } else {
+ update = node->update;
+ }
+
+ node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec,
+ offsets, update);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************
+Updates a secondary index entry of a row. */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool check_ref;
+ ibool found;
+ dict_index_t* index;
+ dtuple_t* entry;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ mem_heap_t* heap;
+ rec_t* rec;
+ ulint err = DB_SUCCESS;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+
+ index = node->index;
+
+ check_ref = row_upd_index_is_referenced(index, trx);
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ if (!found) {
+ fputs("InnoDB: error in sec index entry update in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+
+ trx_print(stderr, trx);
+
+ fputs("\n"
+"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr);
+ } else {
+ /* Delete mark the old index record; it can already be
+ delete marked if we return after a lock wait in
+ row_ins_index_entry below */
+
+ if (!rec_get_deleted_flag(rec, index->table->comp)) {
+ err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
+ thr, &mtr);
+ if (err == DB_SUCCESS && check_ref) {
+
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node,
+ &pcur, index->table,
+ index, thr, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto close_cur;
+ }
+ }
+
+ }
+ }
+close_cur:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete || err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* Build a new index entry */
+ row_upd_index_replace_new_col_vals(entry, index, node->update, NULL);
+
+ /* Insert new index entry */
+ err = row_ins_index_entry(index, entry, NULL, 0, thr);
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***************************************************************
+Updates secondary index record if it is changed in the row update. This
+should be quite rare in database applications. */
+UNIV_INLINE
+ulint
+row_upd_sec_step(
+/*=============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!(node->index->type & DICT_CLUSTERED));
+
+ if (node->state == UPD_NODE_UPDATE_ALL_SEC
+ || row_upd_changes_ord_field_binary(node->row, node->index,
+ node->update)) {
+ err = row_upd_sec_index_entry(node, thr);
+
+ return(err);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications. */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index of the record */
+ que_thr_t* thr, /* in: query thread */
+ ibool check_ref,/* in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ mem_heap_t* heap = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ dtuple_t* entry;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ return(err);
+ }
+
+ /* Mark as not-owned the externally stored fields which the new
+ row inherits from the delete marked record: purge should not
+ free those externally stored fields even if the delete marked
+ record is removed from the index tree, or updated. */
+
+ btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur),
+ rec_get_offsets(btr_cur_get_rec(btr_cur),
+ dict_table_get_first_index(table), offsets_,
+ ULINT_UNDEFINED, &heap), node->update, mtr);
+ if (check_ref) {
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(node,
+ pcur, table,
+ index, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+ }
+
+ }
+
+ mtr_commit(mtr);
+
+ if (!heap) {
+ heap = mem_heap_create(500);
+ }
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_upd_index_replace_new_col_vals(entry, index, node->update, NULL);
+
+ row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+ /* If we return from a lock wait, for example, we may have
+ extern fields marked as not-owned in entry (marked in the
+ if-branch above). We must unmark them. */
+
+ btr_cur_unmark_dtuple_extern_fields(entry, node->ext_vec,
+ node->n_ext_vec);
+ /* We must mark non-updated extern fields in entry as inherited,
+ so that a possible rollback will not free them */
+
+ btr_cur_mark_dtuple_inherited_extern(entry, node->ext_vec,
+ node->n_ext_vec,
+ node->update);
+
+ err = row_ins_index_entry(index, entry, node->ext_vec,
+ node->n_ext_vec, thr);
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***************************************************************
+Updates a clustered index record of a row when the ordering fields do
+not change. */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ big_rec_t* big_rec = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ index->table->comp));
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks because we assume the x-lock on the
+ record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr_start(mtr);
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ index->table->comp));
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+ &big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+ mtr_commit(mtr);
+
+ if (err == DB_SUCCESS && big_rec) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_t* rec;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ mtr_start(mtr);
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+ rec = btr_cur_get_rec(btr_cur);
+ err = btr_store_big_rec_extern_fields(index, rec,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ big_rec, mtr);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ mtr_commit(mtr);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks a clustered index record. */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr, /* in: query thread */
+ ibool check_ref,/* in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(node->is_delete);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ /* Store row because we have to build also the secondary index
+ entries */
+
+ row_upd_store_row(node);
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err == DB_SUCCESS && check_ref) {
+ /* NOTE that the following call loses the position of pcur ! */
+
+ err = row_upd_check_references_constraints(node,
+ pcur, index->table,
+ index, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Updates the clustered index record. */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, DB_LOCK_WAIT in case of a lock wait,
+ else error code */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ ibool check_ref;
+ ulint err;
+ mtr_t* mtr;
+ mtr_t mtr_buf;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ index = dict_table_get_first_index(node->table);
+
+ check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+ mtr = &mtr_buf;
+
+ mtr_start(mtr);
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+
+ /* If this is a row in SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (node->is_delete
+ && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(0,
+ rec, index, offsets, thr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ goto exit_func;
+ }
+ }
+
+ /* NOTE: the following function calls will also commit mtr */
+
+ if (node->is_delete) {
+ err = row_upd_del_mark_clust_rec(node, index, thr, check_ref,
+ mtr);
+ if (err == DB_SUCCESS) {
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ node->index = dict_table_get_next_index(index);
+ }
+ exit_func:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (!node->in_mysql_interface) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+ row_upd_copy_columns(rec, offsets,
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ err = row_upd_clust_rec(node, index, thr, mtr);
+ return(err);
+ }
+
+ row_upd_store_row(node);
+
+ if (row_upd_changes_ord_field_binary(node->row, index, node->update)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+ TODO! What to do to the 'Halloween problem', where an update
+ moves the record forward in index so that it is again
+ updated when the cursor arrives there? Solution: the
+ read operation must check the undo record undo number when
+ choosing records to update. MySQL solves now the problem
+ externally! */
+
+ err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
+ mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ } else {
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_SOME_SEC;
+ }
+
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+}
+
+/***************************************************************
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and the position of the cursor is stored in the cursor. */
+static
+ulint
+row_upd(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err = DB_SUCCESS;
+
+ ut_ad(node && thr);
+
+ if (node->in_mysql_interface) {
+
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (node->is_delete ||
+ row_upd_changes_some_index_ord_field_binary(
+ node->table, node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ if (node->state == UPD_NODE_UPDATE_CLUSTERED
+ || node->state == UPD_NODE_INSERT_CLUSTERED) {
+
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ }
+
+ if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+
+ goto function_exit;
+ }
+
+ while (node->index != NULL) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+function_exit:
+ if (err == DB_SUCCESS) {
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ node->row = NULL;
+ node->n_ext_vec = 0;
+ mem_heap_empty(node->heap);
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_upd_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ ulint err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started(trx);
+
+ node = thr->run_node;
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(thr);
+}
+
+/*************************************************************************
+Performs an in-place update for the current clustered index record in
+select. */
+
+void
+row_upd_in_place_in_select(
+/*=======================*/
+ sel_node_t* sel_node, /* in: select node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ upd_node_t* node;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_;
+
+ ut_ad(sel_node->select_will_do_update);
+ ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(sel_node->asc);
+
+ node = que_node_get_parent(sel_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ /* Copy the necessary columns from clust_rec and calculate the new
+ values to set */
+
+ row_upd_copy_columns(btr_pcur_get_rec(pcur), rec_get_offsets(
+ btr_pcur_get_rec(pcur), btr_cur->index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ UT_LIST_GET_FIRST(node->columns));
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ row_upd_eval_new_vals(node->update);
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ btr_cur->index->table->comp));
+
+ ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE);
+ ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+ ut_ad(node->select_will_do_update);
+
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur,
+ node->update, node->cmpl_info,
+ thr, mtr);
+ ut_ad(err == DB_SUCCESS);
+}
diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c
new file mode 100644
index 00000000000..36f6c27636d
--- /dev/null
+++ b/storage/innobase/row/row0vers.c
@@ -0,0 +1,492 @@
+/******************************************************
+Row versions
+
+(c) 1997 Innobase Oy
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "lock0lock.h"
+
+/*********************************************************************
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function! */
+
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ /* out: NULL if committed, else the active
+ transaction; NOTE that the kernel mutex is
+ temporarily released! */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index, /* in: the secondary index */
+ const ulint* offsets)/* in: rec_get_offsets(rec, index) */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ ulint* clust_offsets;
+ rec_t* version;
+ rec_t* prev_version;
+ dulint trx_id;
+ dulint prev_trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry = NULL; /* assignment to eliminate compiler
+ warning */
+ trx_t* trx;
+ ibool vers_del;
+ ibool rec_del;
+ ulint err;
+ mtr_t mtr;
+ ibool comp;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record: this is a time-consuming
+ operation: therefore we release the kernel mutex; also, the release
+ is required by the latching order convention. The latch on the
+ clustered index locks the top of the stack of versions. We also
+ reserve purge_latch to lock the bottom of the version stack. */
+
+ clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+ &clust_index, &mtr);
+ if (!clust_rec) {
+ /* In a rare case it is possible that no clust rec is found
+ for a secondary index record: if in row0umod.c
+ row_undo_mod_remove_clust_low() we have already removed the
+ clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ mutex_enter(&kernel_mutex);
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+
+ mtr_s_lock(&(purge_sys->latch), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ trx = NULL;
+ if (!trx_is_active(trx_id)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active: no implicit lock on rec */
+ goto exit_func;
+ }
+
+ if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
+ clust_offsets, TRUE)) {
+ /* Corruption noticed: try to avoid a crash by returning */
+ goto exit_func;
+ }
+
+ comp = index->table->comp;
+ ut_ad(index->table == clust_index->table);
+ ut_ad(comp == page_is_comp(buf_frame_align(rec)));
+ ut_ad(comp == page_is_comp(buf_frame_align(clust_rec)));
+
+ /* We look up if some earlier version, which was modified by the trx_id
+ transaction, of the clustered index record would require rec to be in
+ a different state (delete marked or unmarked, or have different field
+ values, or not existing). If there is such a version, then rec was
+ modified by the trx_id transaction, and it has an implicit x-lock on
+ rec. Note that if clust_rec itself would require rec to be in a
+ different state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+ rec_del = rec_get_deleted_flag(rec, comp);
+ trx = NULL;
+
+ version = clust_rec;
+
+ for (;;) {
+ mutex_exit(&kernel_mutex);
+
+ /* While we retrieve an earlier version of clust_rec, we
+ release the kernel mutex, because it may take time to access
+ the disk. After the release, we have to check if the trx_id
+ transaction is still active. We keep the semaphore in mtr on
+ the clust_rec page, so that no other transaction can update
+ it and get an implicit x-lock on rec. */
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+ clust_index, clust_offsets, heap,
+ &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (prev_version) {
+ clust_offsets = rec_get_offsets(prev_version,
+ clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets, heap);
+ entry = row_build_index_entry(row, index, heap);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no implicit x-lock */
+
+ break;
+ }
+
+ /* If the transaction is still active, the previous version
+ of clust_rec must be accessible if not a fresh insert; we
+ may assert the following: */
+
+ ut_ad(err == DB_SUCCESS);
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version: there is an
+ implicit x-lock on rec */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ /* If we get here, we know that the trx_id transaction is
+ still active and it has modified prev_version. Let us check
+ if prev_version would require rec to be in a different
+ state. */
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+
+ /* We check if entry and rec are identified in the alphabetical
+ ordering */
+ if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ /* It is possible that the row was updated so that the
+ secondary index record remained the same in
+ alphabetical ordering, but the field values changed
+ still. For example, 'abc' -> 'ABC'. Check also that. */
+
+ dtuple_set_types_binary(entry,
+ dtuple_get_n_fields(entry));
+ if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+ /* The versions modified by the trx_id transaction end
+ to prev_version: no implicit x-lock */
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+exit_func:
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(trx);
+}
+
+/*********************************************************************
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view. */
+
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ /* out: TRUE if earlier version should be preserved */
+ dulint trx_id, /* in: transaction id in the version */
+ mtr_t* mtr) /* in: mtr holding the latch on the clustered index
+ record; it will also hold the latch on purge_view */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ if (trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* A purge operation is not yet allowed to remove this
+ delete marked record */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry and ientry are identified in
+the alphabetical ordering; exactly in this case we return TRUE. */
+
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ /* out: TRUE if earlier version should have */
+ ibool also_curr,/* in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ rec_t* rec, /* in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /* in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /* in: the secondary index */
+ dtuple_t* ientry) /* in: the secondary index entry */
+{
+ rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ ulint* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry;
+ ulint err;
+ ibool comp;
+
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(rec),
+ MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = index->table->comp;
+ ut_ad(comp == page_is_comp(buf_frame_align(rec)));
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets, heap);
+ entry = row_build_index_entry(row, index, heap);
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the row is maybe being modified so that
+ the clustered index record has already been updated
+ to a different binary value in a char field, but the
+ collation identifies the old and new value anyway! */
+
+ if (dtuple_datas_are_ordering_equal(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, clust_offsets, heap,
+ &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (err != DB_SUCCESS || !prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets, heap);
+ entry = row_build_index_entry(row, index, heap);
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because maybe the secondary index record has
+ already been updated to a different binary value in
+ a char field, but the collation identifies the old
+ and new value anyway! */
+
+ if (dtuple_datas_are_ordering_equal(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*********************************************************************
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version. */
+
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ /* out: DB_SUCCESS or DB_MISSING_HISTORY */
+ rec_t* rec, /* in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this records */
+ mtr_t* mtr, /* in: mtr holding the latch on rec */
+ dict_index_t* index, /* in: the clustered index */
+ ulint** offsets,/* in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /* in: the consistent read view */
+ mem_heap_t** offset_heap,/* in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/* in: memory heap from which the memory for
+ old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/* out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ rec_t* version;
+ rec_t* prev_version;
+ dulint prev_trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(rec),
+ MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ ut_ad(!read_view_sees_trx_id(view,
+ row_get_rec_trx_id(rec, index, *offsets)));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap, &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ *offsets = rec_get_offsets(prev_version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+ prev_trx_id = row_get_rec_trx_id(prev_version, index,
+ *offsets);
+
+ if (read_view_sees_trx_id(view, prev_trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}