author     unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
committer  unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
commit     2662b59306ef0cd495fa6e2edf7129e58a11393a (patch)
tree       bfe39951a73e906579ab819bf5198ad8f3a64a36 /innobase/row
parent     66de55a56bdcf2f7a9c0c4f8e19b3e761475e202 (diff)
download   mariadb-git-2662b59306ef0cd495fa6e2edf7129e58a11393a.tar.gz
Added Innobase to source distribution
Docs/manual.texi:            Added Innobase documentation
configure.in:                Incremented version
include/my_base.h:           Added option for Innobase
myisam/mi_check.c:           cleanup
mysql-test/t/bdb.test:       cleanup
mysql-test/t/innobase.test:  Extended with new tests from bdb.test
mysql-test/t/merge.test:     Added test of SHOW create
mysys/my_init.c:             Fix for UNIXWARE 7
scripts/mysql_install_db.sh: Always write how to start mysqld
scripts/safe_mysqld.sh:      Fixed typo
sql/ha_innobase.cc:          Update to new version
sql/ha_innobase.h:           Update to new version
sql/handler.h:               Added 'update_table_comment()' and 'append_create_info()'
sql/sql_delete.cc:           Fixes for Innobase
sql/sql_select.cc:           Fixes for Innobase
sql/sql_show.cc:             Append create information (for MERGE tables)
sql/sql_update.cc:           Fixes for Innobase
Diffstat (limited to 'innobase/row')
-rw-r--r--  innobase/row/Makefile.am      25
-rw-r--r--  innobase/row/makefilewin      34
-rw-r--r--  innobase/row/row0ins.c      1018
-rw-r--r--  innobase/row/row0mysql.c    1116
-rw-r--r--  innobase/row/row0purge.c     553
-rw-r--r--  innobase/row/row0row.c       652
-rw-r--r--  innobase/row/row0sel.c      2732
-rw-r--r--  innobase/row/row0uins.c      308
-rw-r--r--  innobase/row/row0umod.c      608
-rw-r--r--  innobase/row/row0undo.c      313
-rw-r--r--  innobase/row/row0upd.c      1394
-rw-r--r--  innobase/row/row0vers.c      409
-rw-r--r--  innobase/row/ts/makefile      16
-rw-r--r--  innobase/row/ts/tstcur.c    1087
14 files changed, 10265 insertions, 0 deletions
diff --git a/innobase/row/Makefile.am b/innobase/row/Makefile.am
new file mode 100644
index 00000000000..e4fcbe8f715
--- /dev/null
+++ b/innobase/row/Makefile.am
@@ -0,0 +1,25 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+# & Innobase Oy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+include ../include/Makefile.i
+
+libs_LIBRARIES = librow.a
+
+librow_a_SOURCES = row0ins.c row0mysql.c row0purge.c row0row.c row0sel.c\
+ row0uins.c row0umod.c row0undo.c row0upd.c row0vers.c
+
+EXTRA_PROGRAMS =
diff --git a/innobase/row/makefilewin b/innobase/row/makefilewin
new file mode 100644
index 00000000000..c17240c6119
--- /dev/null
+++ b/innobase/row/makefilewin
@@ -0,0 +1,34 @@
+include ..\include\makefile.i
+
+row.lib: row0mysql.obj row0upd.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+ lib -out:..\libs\row.lib row0mysql.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+
+row0mysql.obj: row0mysql.c
+ $(CCOM) $(CFL) -c row0mysql.c
+
+row0ins.obj: row0ins.c
+ $(CCOM) $(CFL) -c row0ins.c
+
+row0sel.obj: row0sel.c
+ $(CCOM) $(CFL) -c row0sel.c
+
+row0upd.obj: row0upd.c
+ $(CCOM) $(CFL) -c row0upd.c
+
+row0undo.obj: row0undo.c
+ $(CCOM) $(CFL) -c row0undo.c
+
+row0purge.obj: row0purge.c
+ $(CCOM) $(CFL) -c row0purge.c
+
+row0row.obj: row0row.c
+ $(CCOM) $(CFL) -c row0row.c
+
+row0vers.obj: row0vers.c
+ $(CCOM) $(CFL) -c row0vers.c
+
+row0umod.obj: row0umod.c
+ $(CCOM) $(CFL) -c row0umod.c
+
+row0uins.obj: row0uins.c
+ $(CCOM) $(CFL) -c row0uins.c
diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c
new file mode 100644
index 00000000000..4502cb8235f
--- /dev/null
+++ b/innobase/row/row0ins.c
@@ -0,0 +1,1018 @@
+/******************************************************
+Insert into a table
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+
+#define ROW_INS_PREV 1
+#define ROW_INS_NEXT 2
+
+/*************************************************************************
+Creates an insert node struct. */
+
+ins_node_t*
+ins_node_create(
+/*============*/
+ /* out, own: insert node struct */
+ ulint ins_type, /* in: INS_VALUES, ... */
+ dict_table_t* table, /* in: table where to insert */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ ins_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+ node->common.type = QUE_NODE_INSERT;
+
+ node->ins_type = ins_type;
+
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->table = table;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->select = NULL;
+
+ node->trx_id = ut_dulint_zero;
+
+ node->entry_sys_heap = mem_heap_create(128);
+
+ node->magic_n = INS_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***************************************************************
+Creates an entry template for each index of a table. */
+static
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ dict_index_t* index;
+ dtuple_t* entry;
+
+ ut_ad(node->entry_sys_heap);
+
+ UT_LIST_INIT(node->entry_list);
+
+ index = dict_table_get_first_index(node->table);
+
+ while (index != NULL) {
+ entry = row_build_index_entry(node->row, index,
+ node->entry_sys_heap);
+ UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+
+ index = dict_table_get_next_index(index);
+ }
+}
+
+/*********************************************************************
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /* in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dict_col_t* col;
+ dfield_t* dfield;
+ ulint len;
+ byte* ptr;
+
+ row = node->row;
+ table = node->table;
+ heap = node->entry_sys_heap;
+
+ ut_ad(row && table && heap);
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+ /* 1. Allocate buffer for row id */
+
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+ node->row_id_buf = ptr;
+
+ if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
+
+ /* 2. Fill in the dfield for mix id */
+
+ col = dict_table_get_sys_col(table, DATA_MIX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ len = mach_dulint_get_compressed_size(table->mix_id);
+ ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN);
+
+ mach_dulint_write_compressed(ptr, table->mix_id);
+ dfield_set_data(dfield, ptr, len);
+ }
+
+ /* 3. Allocate buffer for trx id */
+
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+ node->trx_id_buf = ptr;
+
+ /* 4. Allocate buffer for roll ptr */
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
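An aside on the buffers reserved above: every Innobase row carries hidden system columns, a row id (used only when the table has no unique key to serve as the clustered index), a transaction id, and a rollback pointer into the undo log. A minimal sketch restating that layout; the 6/6/7-byte lengths are assumed to mirror the DATA_ROW_ID_LEN, DATA_TRX_ID_LEN and DATA_ROLL_PTR_LEN constants, and all names below are hypothetical:

    #include <stdio.h>

    /* Hypothetical restatement of the hidden system columns that
       row_ins_alloc_sys_fields() reserves buffers for; lengths assumed
       to mirror the DATA_*_LEN constants of this code base. */
    enum {
            SYS_ROW_ID_LEN   = 6,   /* assigned when the clustered index
                                       is not built on a unique key */
            SYS_TRX_ID_LEN   = 6,   /* id of the last modifying trx */
            SYS_ROLL_PTR_LEN = 7    /* pointer into the undo log */
    };

    int main(void)
    {
            printf("system column overhead: %d bytes per clustered row\n",
                   SYS_ROW_ID_LEN + SYS_TRX_ID_LEN + SYS_ROLL_PTR_LEN);
            return 0;
    }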
+/*************************************************************************
+Sets a new row to insert for an INS_DIRECT node. This function is only
+used if we have constructed the row separately, which is a rare case;
+it is quite slow. */
+
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /* in: insert node */
+ dtuple_t* row) /* in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate from entry_sys_heap buffers for sys fields */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = ut_dulint_zero;
+}
+
+/***********************************************************************
+Does an insert operation by updating a delete marked existing record
+in the index. This situation can occur if the delete marked record is
+kept in the index for consistent reads. */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ ut_ad(((cursor->index)->type & DICT_CLUSTERED) == 0);
+ ut_ad(rec_get_deleted_flag(btr_cur_get_rec(cursor)));
+
+ /* We just remove the delete mark from the secondary index record */
+ err = btr_cur_del_mark_set_sec_rec(0, cursor, FALSE, thr, mtr);
+
+ return(err);
+}
+
+/***********************************************************************
+Does an insert operation by delete unmarking and updating a delete marked
+existing record in the index. This situation can occur if the delete marked
+record is kept in the index for consistent reads. */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ mem_heap_t* heap;
+ rec_t* rec;
+ upd_t* update;
+ ulint err;
+
+ ut_ad((cursor->index)->type & DICT_CLUSTERED);
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec));
+
+ heap = mem_heap_create(1024);
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may contain also system columns! */
+
+ update = row_upd_build_difference(cursor->index, entry, rec, heap);
+
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*******************************************************************
+Checks if a unique key violation with respect to rec would occur at the
+index entry insert. */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ /* out: TRUE if error */
+ rec_t* rec, /* in: user record */
+ dtuple_t* entry, /* in: entry to insert */
+ dict_index_t* index, /* in: index */
+ trx_t* trx) /* in: inserting transaction */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ trx_t* impl_trx;
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ if (!rec_get_deleted_flag(rec)) {
+
+ return(TRUE);
+ }
+
+ /* If we get here, the record has its delete mark set. It is still
+ a unique key violation if the transaction which set the delete mark
+ is currently active and is not trx itself. We check if some
+ transaction has an implicit x-lock on the record. */
+
+ mutex_enter(&kernel_mutex);
+
+ if (index->type & DICT_CLUSTERED) {
+ impl_trx = lock_clust_rec_some_has_impl(rec, index);
+ } else {
+ impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (impl_trx && impl_trx != trx) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
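The check above reduces to a short decision table: fewer matching fields than the unique key prefix means no violation; a full match against a live record is always a violation; a full match against a delete marked record is a violation only while another active transaction still holds an implicit x-lock on it. A standalone restatement in plain C, with hypothetical flattened inputs (not the Innobase API):

    #include <stdbool.h>

    /* Hypothetical flattened inputs, for illustration only. */
    typedef struct {
            bool delete_marked;          /* rec_get_deleted_flag(rec) */
            bool other_active_trx_lock;  /* impl_trx && impl_trx != trx */
    } rec_state_t;

    /* The decision table implemented by row_ins_dupl_error_with_rec(). */
    static bool
    is_dupl_error(unsigned matched_fields, unsigned n_unique,
                  rec_state_t rec)
    {
            if (matched_fields < n_unique) {
                    return false;  /* key values differ: no violation */
            }
            if (!rec.delete_marked) {
                    return true;   /* live record with the same key */
            }
            /* Delete marked: still a violation while the transaction
               that set the mark is active and is not ourselves. */
            return rec.other_active_trx_lock;
    }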
+/*******************************************************************
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry. */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ /* out: DB_SUCCESS or DB_DUPLICATE_KEY */
+ dict_index_t* index, /* in: non-clustered unique index */
+ dtuple_t* entry, /* in: index entry */
+ trx_t* trx) /* in: inserting transaction */
+{
+ ulint dupl_count = 0;
+ int cmp;
+ ulint n_fields_cmp;
+ rec_t* rec;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+	/* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+
+ btr_pcur_open_on_user_rec(index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check that there are no duplicates */
+
+ for (;;) {
+ if (btr_pcur_is_after_last_in_tree(&pcur, &mtr)) {
+
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ cmp = cmp_dtuple_rec(entry, rec);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry, index,
+ trx)) {
+ dupl_count++;
+
+ if (dupl_count > 1) {
+ /* printf(
+ "Duplicate key in index %s\n",
+ index->name);
+ dtuple_print(entry); */
+ }
+ }
+ }
+
+ if (cmp < 0) {
+ break;
+ }
+
+ ut_a(cmp == 0);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ ut_a(dupl_count >= 1);
+
+ if (dupl_count > 1) {
+
+ return(DB_DUPLICATE_KEY);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************
+Tries to check if a unique key violation error would occur at an index entry
+insert. */
+static
+ulint
+row_ins_duplicate_error(
+/*====================*/
+ /* out: DB_SUCCESS if no error
+ DB_DUPLICATE_KEY if error,
+ DB_STRONG_FAIL if this is a non-clustered
+ index record and we cannot determine yet
+ if there will be an error: in this last
+ case we must call
+ row_ins_scan_sec_index_for_duplicate
+ AFTER the insertion of the record! */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ dtuple_t* entry, /* in: entry to insert */
+ trx_t* trx, /* in: inserting transaction */
+ mtr_t* mtr, /* in: mtr */
+ rec_t** dupl_rec)/* out: record with which duplicate error */
+{
+ rec_t* rec;
+ page_t* page;
+ ulint n_unique;
+
+ ut_ad(cursor->index->type & DICT_UNIQUE);
+
+ /* NOTE: For unique non-clustered indexes there may be any number
+ of delete marked records with the same value for the non-clustered
+ index key (remember multiversioning), and which differ only in
+	the row reference part of the index record, containing the
+	clustered index key fields. For such a secondary index record,
+	to avoid a race condition, we must FIRST do the insertion and after
+	that check that the uniqueness condition is not breached! */
+
+	/* NOTE: A problem is that in the B-tree, node pointers on an
+	upper level may match the entry more closely than the actual
+	existing user records on the leaf level. So, even if low_match
+	would suggest that a duplicate key violation may occur, this may
+	not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_infimum_rec(page)) {
+
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ cursor->index, trx)) {
+ *dupl_rec = rec;
+
+ return(DB_DUPLICATE_KEY);
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_supremum_rec(page)) {
+
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ cursor->index, trx)) {
+ *dupl_rec = rec;
+
+ return(DB_DUPLICATE_KEY);
+ }
+ }
+
+ ut_a(!(cursor->index->type & DICT_CLUSTERED));
+ /* This should never happen */
+ }
+
+ if (cursor->index->type & DICT_CLUSTERED) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* It was a non-clustered index: we must scan the index after the
+ insertion to be sure if there will be duplicate key error */
+
+ return(DB_STRONG_FAIL);
+}
+
+/*******************************************************************
+Checks if an index entry has long enough common prefix with an existing
+record so that the intended insert of the entry must be changed to a modify of
+the existing record. In the case of a clustered index, the prefix must be
+n_unique fields long, and in the case of a secondary index, all fields must be
+equal. */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+ /* out: 0 if no update, ROW_INS_PREV if
+ previous should be updated; currently we
+ do the search so that only the low_match
+ record can match enough to the search tuple,
+ not the next record */
+ btr_cur_t* cursor) /* in: B-tree cursor */
+{
+ ulint enough_match;
+ rec_t* rec;
+ page_t* page;
+
+	/* NOTE: (compare to the note in row_ins_duplicate_error) Because
+	node pointers on upper levels of the B-tree may match the entry
+	more closely than the actual user records on the leaf level, we
+	have to check if the candidate record is actually a user record.
+	In a clustered index, node pointers contain the index->n_unique
+	first fields, and in the case of a secondary index, all fields of
+	the index. */
+
+ enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+
+ if (cursor->low_match >= enough_match) {
+
+ rec = btr_cur_get_rec(cursor);
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_infimum_rec(page)) {
+
+ return(ROW_INS_PREV);
+ }
+ }
+
+ return(0);
+}
+
+/*******************************************************************
+Tries to insert an index entry into an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily delete marked by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to
+match the entry, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily delete marked.
+It is then delete unmarked. Otherwise, the entry is just inserted into
+the index. */
+
+ulint
+row_ins_index_entry_low(
+/*====================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL
+ if pessimistic retry needed, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_cur_t cursor;
+ ulint dupl = DB_SUCCESS;
+ ulint modify;
+ rec_t* dummy_rec;
+ rec_t* rec;
+ rec_t* dupl_rec; /* Note that this may be undefined
+ for a non-clustered index even if
+ there is a duplicate key */
+ ulint err;
+ ulint n_unique;
+ mtr_t mtr;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ mode | BTR_INSERT, &cursor, 0, &mtr);
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ /* The insertion was made to the insert buffer already during
+ the search: we are done */
+
+ err = DB_SUCCESS;
+
+ goto function_exit;
+ }
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique
+ || cursor.low_match >= n_unique)) {
+
+ dupl = row_ins_duplicate_error(&cursor, entry,
+ thr_get_trx(thr), &mtr, &dupl_rec);
+ if (dupl == DB_DUPLICATE_KEY) {
+
+ /* printf("Duplicate key in index %s lm %lu\n",
+ cursor->index->name, cursor->low_match);
+ rec_print(rec);
+ dtuple_print(entry); */
+
+ err = dupl;
+
+ goto function_exit;
+ }
+ }
+
+ modify = row_ins_must_modify(&cursor);
+
+ if (modify != 0) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+
+ if (modify == ROW_INS_NEXT) {
+ rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+ btr_cur_position(index, rec, &cursor);
+ }
+
+ if (index->type & DICT_CLUSTERED) {
+ err = row_ins_clust_index_entry_by_modify(mode,
+ &cursor, entry,
+ thr, &mtr);
+ } else {
+ err = row_ins_sec_index_entry_by_modify(&cursor,
+ thr, &mtr);
+ }
+
+ } else if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(0, &cursor, entry,
+ &dummy_rec, thr, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_insert(0, &cursor, entry,
+ &dummy_rec, thr, &mtr);
+ }
+function_exit:
+ mtr_commit(&mtr);
+
+ if (err == DB_SUCCESS && dupl == DB_STRONG_FAIL) {
+ /* We were not able to determine before the insertion
+ whether there will be a duplicate key error: do the check
+ now */
+
+ err = row_ins_scan_sec_index_for_duplicate(index, entry,
+ thr_get_trx(thr));
+ }
+
+ ut_ad(err != DB_DUPLICATE_KEY || index->type & DICT_CLUSTERED
+ || DB_DUPLICATE_KEY ==
+ row_ins_scan_sec_index_for_duplicate(index, entry,
+ thr_get_trx(thr)));
+ return(err);
+}
+
+/*******************************************************************
+Inserts an index entry into an index. Tries first an optimistic, then
+a pessimistic descent down the tree. If the entry matches enough to a
+delete marked record, performs the insert by updating or delete
+unmarking the delete marked record. */
+
+ulint
+row_ins_index_entry(
+/*================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DUPLICATE_KEY, or some other error code */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr);
+
+ if (err != DB_FAIL) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr);
+
+ return(err);
+}
+
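The two-phase structure above is a pattern used throughout Innobase: attempt the cheap, leaf-page-only operation first, and only when it returns DB_FAIL retry with tree latches that allow page splits and merges. A condensed sketch of the pattern, with hypothetical names standing in for the BTR_MODIFY_* modes and error codes:

    /* Sketch of the optimistic-then-pessimistic retry in
       row_ins_index_entry(); op() and the constants are hypothetical
       stand-ins, not the Innobase API. */
    enum { MODE_LEAF, MODE_TREE };        /* BTR_MODIFY_LEAF / _TREE */
    enum { ERR_OK = 0, ERR_FAIL = 1 };    /* DB_SUCCESS / DB_FAIL */

    typedef int (*index_op_t)(int mode, void *arg);

    static int
    op_with_retry(index_op_t op, void *arg)
    {
            int err = op(MODE_LEAF, arg); /* optimistic: leaf latch only */

            if (err != ERR_FAIL) {
                    return err;           /* success, lock wait, or a
                                             hard error: do not retry */
            }
            /* The leaf page had no room (or similar): retry with tree
               latches, which may split or merge pages. */
            return op(MODE_TREE, arg);
    }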
+/***************************************************************
+Sets the values of the dtuple fields in entry from the values of the
+appropriate columns in row. */
+UNIV_INLINE
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+ dtuple_t* entry, /* in: index entry to make */
+ dtuple_t* row) /* in: row */
+{
+ dfield_t* field;
+ dfield_t* row_field;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(entry && row);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(entry, i);
+
+ row_field = dtuple_get_nth_field(row, field->col_no);
+
+ field->data = row_field->data;
+ field->len = row_field->len;
+ }
+}
+
+/***************************************************************
+Inserts a single index entry into the table. */
+UNIV_INLINE
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ row_ins_index_entry_set_vals(node->entry, node->row);
+
+ ut_ad(dtuple_check_typed(node->entry));
+
+ err = row_ins_index_entry(node->index, node->entry, thr);
+
+ return(err);
+}
+
+/***************************************************************
+Allocates a row id for row and inits the node->index field. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ dulint row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_table_get_first_index(node->table)->type & DICT_UNIQUE) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
+
+ dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***************************************************************
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are copied into the buffers of the value
+	expressions and it is safe to use them until we evaluate the list
+	again: therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***************************************************************
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are copied into the buffers of the select node
+	and it is safe to use them until we fetch from select again:
+	therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***************************************************************
+Inserts a row into a table. */
+
+ulint
+row_ins(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+ }
+
+ ut_ad(node->entry == NULL);
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Inserts a row into a table. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_ins_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ ulint err;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. */
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+ /* No need to do IX-locking or write trx id to buf */
+
+ goto same_trx;
+ }
+
+ trx_write_trx_id(node->trx_id_buf, trx->id);
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+ same_trx:
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
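Closing note on this file: row_ins_step() and row_ins() together drive a small per-row state machine. The enum below is an editor's condensed view with hypothetical names; the real INS_NODE_* constants are defined in row0ins.h:

    /* Condensed view of the insert node states used above. */
    typedef enum {
            STATE_SET_IX_LOCK,    /* first row of a statement: IX-lock
                                     the table, write the trx id, reset
                                     a possible select node */
            STATE_ALLOC_ROW_ID,   /* per row: assign a row id (unless the
                                     clustered index is unique) and fetch
                                     or evaluate the row to insert */
            STATE_INSERT_ENTRIES  /* insert one entry per index; resumes
                                     here after a lock wait */
    } ins_node_state;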
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
new file mode 100644
index 00000000000..13d84ffd358
--- /dev/null
+++ b/innobase/row/row0mysql.c
@@ -0,0 +1,1116 @@
+/******************************************************
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+
+(c) 2000 Innobase Oy
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+
+/***********************************************************************
+Reads the length of a MySQL format variable-length field (like VARCHAR)
+and returns a pointer to the field data. */
+
+byte*
+row_mysql_read_var_ref_noninline(
+/*=============================*/
+ /* out: field + 2 */
+ ulint* len, /* out: variable-length field length */
+ byte* field) /* in: field */
+{
+ return(row_mysql_read_var_ref(len, field));
+}
+
+/***********************************************************************
+Stores a reference to a BLOB in the MySQL format. */
+
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /* in: where to store */
+ ulint col_len, /* in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ this may vary from 1 to 4 bytes */
+ byte* data, /* in: BLOB data */
+ ulint len) /* in: BLOB length */
+{
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+	On 32-bit architectures we only use the first 4 bytes of the
+	pointer slot. */
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ ut_memcpy(dest + col_len - 8, (byte*)&data, sizeof(byte*));
+}
+
+/***********************************************************************
+Reads a reference to a BLOB in the MySQL format. */
+
+byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ /* out: pointer to BLOB data */
+ ulint* len, /* out: BLOB length */
+ byte* ref, /* in: BLOB reference in the MySQL format */
+ ulint col_len) /* in: BLOB reference length (not BLOB
+ length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ ut_memcpy((byte*)&data, ref + col_len - 8, sizeof(byte*));
+
+ return(data);
+}
+
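As the pair above shows, a MySQL BLOB reference occupies col_len bytes: the first col_len - 8 bytes hold the data length in little-endian order, and the trailing 8 bytes hold a native pointer (32-bit builds use only the first 4 of them). A standalone round-trip sketch using plain memcpy; the helper names are hypothetical, not the mach_/ut_ utilities:

    #include <stdint.h>
    #include <string.h>

    /* Store a BLOB reference: little-endian length bytes followed by
       8 bytes of pointer storage. */
    static void
    blob_ref_store(unsigned char *dest, size_t col_len,
                   unsigned char *data, uint32_t len)
    {
            size_t  i;
            size_t  len_bytes = col_len - 8;

            for (i = 0; i < len_bytes; i++) {
                    dest[i] = (unsigned char)(len >> (8 * i));
            }
            memcpy(dest + len_bytes, &data, sizeof(unsigned char *));
    }

    /* Read it back: returns the data pointer, sets *len. */
    static unsigned char *
    blob_ref_read(uint32_t *len, const unsigned char *ref, size_t col_len)
    {
            unsigned char  *data;
            size_t          i;
            size_t          len_bytes = col_len - 8;

            *len = 0;
            for (i = 0; i < len_bytes; i++) {
                    *len |= (uint32_t)ref[i] << (8 * i);
            }
            memcpy(&data, ref + len_bytes, sizeof(unsigned char *));
            return data;
    }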
+/******************************************************************
+Converts a row in the MySQL format to a row in the Innobase format. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /* in/out: Innobase row where the
+ field type information is already
+ copied there, or will be copied
+ later */
+ byte* buf, /* in/out: buffer to use in converting
+ data in columns; this must be at least
+ the size of mysql_rec! */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ byte* mysql_rec) /* in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+{
+ mysql_row_templ_t* templ;
+ dfield_t* dfield;
+ ulint i;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+ dfield = dtuple_get_nth_field(row, i);
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset] &
+ (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(dfield,
+ prebuilt->ins_upd_rec_buff
+ + templ->mysql_col_offset,
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ templ->type, templ->is_unsigned);
+next_column:
+ ;
+ }
+}
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine. */
+
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ /* out: TRUE if it was a lock wait and
+ we should continue running the query thread */
+ ulint* new_err,/* out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* thr, /* in: query thread */
+ trx_savept_t* savept) /* in: savepoint */
+{
+ ibool timeout_expired;
+ ulint err;
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (err == DB_DUPLICATE_KEY) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ } else if (err == DB_TOO_BIG_RECORD) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ } else if (err == DB_LOCK_WAIT) {
+
+ timeout_expired = srv_suspend_mysql_thread(thr);
+
+ if (timeout_expired) {
+ trx->error_state = DB_DEADLOCK;
+
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(TRUE);
+
+ } else if (err == DB_DEADLOCK) {
+
+ /* Roll back the whole transaction */
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+
+ /* Roll back the whole transaction */
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ } else if (err == DB_MUST_GET_MORE_FILE_SPACE) {
+
+ ut_a(0); /* TODO: print something to MySQL error log */
+ } else {
+ ut_a(0);
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(FALSE);
+}
+
+/************************************************************************
+Create a prebuilt struct for a MySQL table handle. */
+
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ /* out, own: a prebuilt struct */
+ dict_table_t* table) /* in: Innobase table handle */
+{
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ ulint ref_len;
+ ulint i;
+
+ heap = mem_heap_create(128);
+
+ prebuilt = mem_heap_alloc(heap, sizeof(row_prebuilt_t));
+
+ prebuilt->table = table;
+
+ prebuilt->trx = NULL;
+
+ prebuilt->sql_stat_start = TRUE;
+
+ prebuilt->index = NULL;
+ prebuilt->n_template = 0;
+ prebuilt->mysql_template = NULL;
+
+ prebuilt->heap = heap;
+ prebuilt->ins_node = NULL;
+
+ prebuilt->ins_upd_rec_buff = NULL;
+
+ prebuilt->upd_node = NULL;
+ prebuilt->ins_graph = NULL;
+ prebuilt->upd_graph = NULL;
+
+ prebuilt->pcur = btr_pcur_create_for_mysql();
+ prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ prebuilt->sel_graph = NULL;
+
+ prebuilt->search_tuple = dtuple_create(heap,
+ dict_table_get_n_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ prebuilt->fetch_cache[i] = NULL;
+ }
+
+ prebuilt->n_fetch_cached = 0;
+
+ prebuilt->blob_heap = NULL;
+
+ prebuilt->old_vers_heap = NULL;
+
+ return(prebuilt);
+}
+
+/************************************************************************
+Free a prebuilt struct for a MySQL table handle. */
+
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt) /* in, own: prebuilt struct */
+{
+ ulint i;
+
+ btr_pcur_free_for_mysql(prebuilt->pcur);
+ btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+ if (prebuilt->mysql_template) {
+ mem_free(prebuilt->mysql_template);
+ }
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ mem_heap_free(prebuilt->blob_heap);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ if (prebuilt->fetch_cache[i] != NULL) {
+ mem_free(prebuilt->fetch_cache[i]);
+ }
+ }
+
+ mem_heap_free(prebuilt->heap);
+}
+
+/*************************************************************************
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL
+ handle */
+ trx_t* trx) /* in: transaction handle */
+{
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it. */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ /* out: prebuilt dtuple */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node == NULL) {
+
+		/* Not called before for this handle: create an insert node
+		and a query graph in the prebuilt struct */
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(
+ prebuilt->heap,
+ prebuilt->mysql_row_len);
+ }
+
+ row = dtuple_create(prebuilt->heap,
+ dict_table_get_n_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->ins_node->row);
+}
+
+/*************************************************************************
+Updates the table modification counter and calculates new estimates
+for table and index statistics if necessary. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct */
+{
+ ulint counter;
+ ulint old_counter;
+
+ counter = prebuilt->table->stat_modif_counter;
+
+ counter += prebuilt->mysql_row_len;
+ prebuilt->table->stat_modif_counter = counter;
+
+ old_counter = prebuilt->table->stat_last_estimate_counter;
+
+ if (counter - old_counter >= DICT_STAT_CALCULATE_INTERVAL
+ || counter - old_counter >=
+ (UNIV_PAGE_SIZE
+ * prebuilt->table->stat_clustered_index_size / 2)) {
+
+ dict_update_statistics(prebuilt->table);
+ }
+}
+
+/*************************************************************************
+Does an insert for MySQL. */
+
+int
+row_insert_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ row_mysql_convert_row_to_innobase(node->row,
+ prebuilt->ins_upd_rec_buff,
+ prebuilt, mysql_rec);
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ prebuilt->table->stat_n_rows++;
+
+ if (prebuilt->table->stat_n_rows == 0) {
+		/* Avoid wrap-around */
+ prebuilt->table->stat_n_rows--;
+ }
+
+ row_update_statistics_if_needed(prebuilt);
+
+ return((int) err);
+}
+
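The function above shows the standard MySQL-interface execution loop: run the node, and on error either retry (the error was a lock wait that has since been resolved) or return it. A runnable, condensed sketch of that control flow; every name here is a hypothetical stand-in, not the Innobase API:

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { DB_OK, DB_LOCK_WAIT_ERR, DB_DUP_KEY_ERR } db_err_t;

    static int attempts;       /* demo state: fail once with a lock wait */

    static db_err_t step(void) /* stands in for row_ins_step() */
    {
            return (attempts++ == 0) ? DB_LOCK_WAIT_ERR : DB_OK;
    }

    /* Returns true when the error was a lock wait that has since been
       resolved, so the step should simply be run again; the real code
       suspends the thread here and may turn a timeout into a deadlock
       error (see row_mysql_handle_errors()). */
    static bool handle_errors(db_err_t *err)
    {
            return (*err == DB_LOCK_WAIT_ERR);
    }

    int main(void)
    {
            db_err_t err;

            do {               /* the "run_again:" loop, without goto */
                    err = step();
            } while (err != DB_OK && handle_errors(&err));

            printf("final err = %d after %d attempt(s)\n", err, attempts);
            return 0;
    }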
+/*************************************************************************
+Builds a dummy query graph used in selects. */
+
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it. */
+
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ /* out: prebuilt update vector */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->upd_node == NULL) {
+
+		/* Not called before for this handle: create an update node
+		and a query graph in the prebuilt struct */
+
+ node = upd_node_create(prebuilt->heap);
+
+ prebuilt->upd_node = node;
+
+ node->in_mysql_interface = TRUE;
+ node->is_delete = FALSE;
+ node->searched_update = FALSE;
+ node->select_will_do_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table),
+ prebuilt->heap);
+ UT_LIST_INIT(node->columns);
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ prebuilt->upd_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/*************************************************************************
+Does an update or delete of a row for MySQL. */
+
+int
+row_update_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ ulint err;
+ que_thr_t* thr;
+ ibool was_lock_wait;
+ dict_index_t* clust_index;
+ ulint ref_len;
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+ mem_heap_t* heap;
+ dtuple_t* search_tuple;
+ dtuple_t* row_tuple;
+ mtr_t mtr;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ node = prebuilt->upd_node;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (prebuilt->in_update_remember_pos) {
+ if (prebuilt->index == clust_index) {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->pcur);
+ } else {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->clust_pcur);
+ }
+
+ ut_ad(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ goto skip_cursor_search;
+ }
+
+ /* We have to search for the correct cursor position */
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ heap = mem_heap_create(450);
+
+ row_tuple = dtuple_create(heap, dict_table_get_n_cols(table));
+ dict_table_copy_types(row_tuple, table);
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(prebuilt->heap,
+ prebuilt->mysql_row_len);
+ }
+
+ row_mysql_convert_row_to_innobase(row_tuple,
+ prebuilt->ins_upd_rec_buff,
+ prebuilt, mysql_rec);
+
+ search_tuple = dtuple_create(heap, ref_len);
+
+ row_build_row_ref_from_row(search_tuple, table, row_tuple);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_with_no_init(clust_index, search_tuple, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, node->pcur, 0, &mtr);
+
+ btr_pcur_store_position(node->pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ mem_heap_free(heap);
+
+skip_cursor_search:
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+
+ return((int) err);
+ }
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (prebuilt->upd_node->is_delete) {
+ if (prebuilt->table->stat_n_rows > 0) {
+ prebuilt->table->stat_n_rows--;
+ }
+ }
+
+ row_update_statistics_if_needed(prebuilt);
+
+ return((int) err);
+}
+
+/*************************************************************************
+Checks if a table is such that we automatically created a clustered
+index on it (on row id). */
+
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ dict_table_t* table)
+{
+ dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (dtype_get_mtype(dict_index_get_nth_type(clust_index, 0))
+ == DATA_SYS) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Does a table creation operation for MySQL. */
+
+int
+row_create_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ dict_table_t* table, /* in: table definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = TRUE;
+
+ node = tab_create_graph_create(table, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr),
+ SESS_COMM_EXECUTE, 0));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(table->name, trx, TRUE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return((int) err);
+}
+
+/*************************************************************************
+Does an index creation operation for MySQL. TODO: currently a failure
+to create an index results in dropping the whole table! This is not a
+problem as long as all indexes must be created at the same time as the
+table. */
+
+int
+row_create_index_for_mysql(
+/*=======================*/
+ /* out: error number or DB_SUCCESS */
+	dict_index_t*	index,	/* in: index definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = TRUE;
+
+ node = ind_create_graph_create(index, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr),
+ SESS_COMM_EXECUTE, 0));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(index->table_name, trx, TRUE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return((int) err);
+}
+
+/*************************************************************************
+Drops a table for MySQL. */
+
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ /* out: error code or DB_SUCCESS */
+ char* name, /* in: table name */
+ trx_t* trx, /* in: transaction handle */
+ ibool has_dict_mutex) /* in: TRUE if the caller already owns the
+ dictionary system mutex */
+{
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph;
+ ulint err;
+ char* str1;
+ char* str2;
+ ulint len;
+ char buf[10000];
+retry:
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(name != NULL);
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ str1 =
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "table_id CHAR;\n"
+ "index_id CHAR;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "SELECT ID INTO table_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME ='";
+
+ str2 =
+ "';\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO index_id\n"
+ " FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID = table_id;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_INDEXES WHERE ID = index_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n"
+ "DELETE FROM SYS_TABLES WHERE ID = table_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ len = ut_strlen(str1);
+
+ ut_memcpy(buf, str1, len);
+ ut_memcpy(buf + len, name, ut_strlen(name));
+
+ len += ut_strlen(name);
+
+ ut_memcpy(buf + len, str2, ut_strlen(str2) + 1);
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ if (!has_dict_mutex) {
+ mutex_enter(&(dict_sys->mutex));
+ }
+
+ graph = pars_sql(buf);
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ /* Prevent purge from running while we are dropping the table */
+ rw_lock_s_lock(&(purge_sys->purge_is_running));
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ /* Check if there are any locks on the table: if yes, it cannot
+ be dropped: we have to wait for the locks to be released */
+
+ if (lock_is_on_table(table)) {
+
+ err = DB_TABLE_IS_BEING_USED;
+
+ goto funct_exit;
+ }
+
+	/* TODO: check that MySQL prevents users from accessing the table
+	after this function row_drop_table_for_mysql has been called:
+	otherwise anyone with an open handle to the table could, for
+	example, still read the table! */
+
+ trx->dict_operation = TRUE;
+ trx->table_id = table->id;
+
+ ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ ut_a(0);
+ } else {
+ dict_table_remove_from_cache(table);
+ }
+funct_exit:
+ rw_lock_s_unlock(&(purge_sys->purge_is_running));
+
+ if (!has_dict_mutex) {
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ que_graph_free(graph);
+
+ if (err == DB_TABLE_IS_BEING_USED) {
+ os_thread_sleep(200000);
+
+ goto retry;
+ }
+
+ return((int) err);
+}
+
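The procedure text above is spliced together by hand with ut_strlen/ut_memcpy into a fixed 10000-byte buffer. For reference, the same splice in portable C with an explicit bounds check; this is an editor's illustration using standard snprintf, not an Innobase utility:

    #include <stdio.h>

    /* Splice a table name between the head and tail of a SQL procedure
       template, as row_drop_table_for_mysql() does by hand.  Returns 0
       on success, -1 if the result would not fit in buf. */
    static int
    build_proc_sql(char *buf, size_t buf_len, const char *head,
                   const char *name, const char *tail)
    {
            int n = snprintf(buf, buf_len, "%s%s%s", head, name, tail);

            return (n >= 0 && (size_t) n < buf_len) ? 0 : -1;
    }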
+/*************************************************************************
+Renames a table for MySQL. */
+
+int
+row_rename_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ char* old_name, /* in: old table name */
+ char* new_name, /* in: new table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph;
+ ulint err;
+ char* str1;
+ char* str2;
+ char* str3;
+ ulint len;
+ char buf[10000];
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+
+ str1 =
+ "PROCEDURE RENAME_TABLE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME ='";
+
+ str2 =
+ "' WHERE NAME = '";
+
+ str3 =
+ "';\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ len = ut_strlen(str1);
+
+ ut_memcpy(buf, str1, len);
+
+ ut_memcpy(buf + len, new_name, ut_strlen(new_name));
+
+ len += ut_strlen(new_name);
+
+ ut_memcpy(buf + len, str2, ut_strlen(str2));
+
+ len += ut_strlen(str2);
+
+ ut_memcpy(buf + len, old_name, ut_strlen(old_name));
+
+ len += ut_strlen(old_name);
+
+ ut_memcpy(buf + len, str3, ut_strlen(str3) + 1);
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = dict_table_get_low(old_name);
+
+ graph = pars_sql(buf);
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ row_mysql_handle_errors(&err, trx, thr, NULL);
+ } else {
+ ut_a(dict_table_rename_in_cache(table, new_name));
+ }
+funct_exit:
+ mutex_exit(&(dict_sys->mutex));
+
+ que_graph_free(graph);
+
+ return((int) err);
+}
diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c
new file mode 100644
index 00000000000..0a6fabe584c
--- /dev/null
+++ b/innobase/row/row0purge.c
@@ -0,0 +1,553 @@
+/******************************************************
+Purge obsolete records
+
+(c) 1997 Innobase Oy
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "log0log.h"
+
+/************************************************************************
+Creates a purge node for a query graph. */
+
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ /* out, own: purge node */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ purge_node_t* node;
+
+ ut_ad(parent && heap);
+
+ node = mem_heap_alloc(heap, sizeof(purge_node_t));
+
+ node->common.type = QUE_NODE_PURGE;
+ node->common.parent = parent;
+
+ node->heap = mem_heap_create(256);
+
+ return(node);
+}
+
+/***************************************************************
+Repositions the pcur in the purge node on the clustered index record,
+if found. */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ /* out: TRUE if the record was found */
+ ulint mode, /* in: latching mode */
+ purge_node_t* node, /* in: row purge node */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool found;
+
+ if (node->found_clust) {
+ found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+
+ return(found);
+ }
+
+ found = row_search_on_row_ref(&(node->pcur), mode, node->table,
+ node->ref, mtr);
+ node->found_clust = found;
+
+ if (found) {
+ btr_pcur_store_position(&(node->pcur), mtr);
+ }
+
+ return(found);
+}
+
+/***************************************************************
+Removes a delete marked clustered index record if possible. */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ /* out: TRUE if success, or if not found, or
+ if modified after the delete marking */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr, /* in: query thread */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ UT_NOT_USED(thr);
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ mtr_start(&mtr);
+
+ success = row_purge_reposition_pcur(mode, node, &mtr);
+
+ if (!success) {
+ /* The record is already removed */
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(btr_pcur_get_rec(pcur), index))) {
+
+ /* Someone else has modified the record later: do not remove */
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_a(0);
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a clustered index record if it has not been modified after the delete
+marking. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+/* printf("Purge: Removing clustered record\n"); */
+
+ success = row_purge_remove_clust_if_poss_low(node, thr,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_clust_if_poss_low(node, thr,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
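+
+/* Note on the pattern above: btr_cur_optimistic_delete with
+BTR_MODIFY_LEAF is tried first because it needs to x-latch only the
+leaf page; it fails if removing the record would require restructuring
+the tree, for example when the page would become too empty. Only then
+do we fall back to the pessimistic BTR_MODIFY_TREE variant, which may
+modify the whole tree path and can thus run out of file space, hence
+the retry loop above. */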
+
+/***************************************************************
+Removes a secondary index entry if possible. */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+ /* out: TRUE if success or if not found */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ibool found;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ UT_NOT_USED(thr);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ if (!found) {
+ /* Not found */
+
+ /* FIXME: printf("PURGE:........sec entry not found\n"); */
+ /* dtuple_print(entry); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	/* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If some later
+	version requires it, we should do nothing. */
+
+ mtr_start(&mtr_vers);
+
+ success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers);
+
+ if (success) {
+ old_has = row_vers_old_has_index_entry(TRUE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+ if (!success || !old_has) {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_a(0);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+/* printf("Purge: Removing secondary record\n"); */
+
+ success = row_purge_remove_sec_if_poss_low(node, thr, index, entry,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_low(node, thr, index, entry,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***************************************************************
+Purges a delete marking of a record. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node && thr);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ /* Build the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_purge_remove_sec_if_poss(node, thr, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ row_purge_remove_clust_if_poss(node, thr);
+}
+
+/***************************************************************
+Purges an update of an existing record. */
+static
+void
+row_purge_upd_exist(
+/*================*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node && thr);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field(NULL, node->index,
+ node->update)) {
+ /* Build the older version of the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_purge_remove_sec_if_poss(node, thr, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+ /* out: TRUE if purge operation required */
+ purge_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (type == TRX_UNDO_UPD_DEL_REC) {
+
+ return(FALSE);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ node->table = NULL;
+
+ if (type == TRX_UNDO_UPD_EXIST_REC
+ && cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ /* Purge requires no changes to indexes: we may return */
+
+ return(FALSE);
+ }
+
+ /* NOTE that the table has to be explicitly released later */
+
+ /* TODO: currently nothing prevents dropping of table when purge
+ is accessing it! */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr));
+
+ rw_lock_x_lock(&(purge_sys->purge_is_running));
+
+ mutex_exit(&(dict_sys->mutex));
+
+ if (node->table == NULL) {
+ /* The table has been dropped: no need to do purge */
+
+ rw_lock_x_unlock(&(purge_sys->purge_is_running));
+
+ return(FALSE);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, node->heap,
+ &(node->update));
+
+	/* Read into the partial row the fields that occur in indexes */
+
+ ptr = trx_undo_rec_get_partial_row(ptr, clust_index, &(node->row),
+ node->heap);
+ return(TRUE);
+}
+
+/***************************************************************
+Fetches an undo log record and does the purge for the recorded operation.
+If none is left, or the current purge has completed, returns control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_purge(
+/*======*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dulint roll_ptr;
+ ibool purge_needed;
+
+ ut_ad(node && thr);
+
+ node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+ &(node->reservation),
+ node->heap);
+ if (!node->undo_rec) {
+ /* Purge completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+
+ if (node->undo_rec == &trx_purge_dummy_rec) {
+ purge_needed = FALSE;
+ } else {
+ purge_needed = row_purge_parse_undo_rec(node, thr);
+ }
+
+ if (purge_needed) {
+ node->found_clust = FALSE;
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+ row_purge_upd_exist(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_DEL_MARK_REC);
+ row_purge_del_mark(node, thr);
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&(node->pcur));
+ }
+
+ rw_lock_x_unlock(&(purge_sys->purge_is_running));
+ }
+
+ /* Do some cleanup */
+ trx_purge_rec_release(node->reservation);
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph. */
+
+que_thr_t*
+row_purge_step(
+/*===========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ purge_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ err = row_purge(node, thr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ return(thr);
+}
diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c
new file mode 100644
index 00000000000..f85789fa0d6
--- /dev/null
+++ b/innobase/row/row0row.c
@@ -0,0 +1,652 @@
+/******************************************************
+General row routines
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+
+/*************************************************************************
+Reads the trx id or roll ptr field from a clustered index record: this function
+is slower than the specialized inline functions. */
+
+dulint
+row_get_rec_sys_field(
+/*==================*/
+ /* out: value of the field */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index) /* in: clustered index */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ field = rec_get_nth_field(rec, pos, &len);
+
+ if (type == DATA_TRX_ID) {
+
+ return(trx_read_trx_id(field));
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+
+ return(trx_read_roll_ptr(field));
+ }
+}
+
+/*************************************************************************
+Sets the trx id or roll ptr field in a clustered index record: this function
+is slower than the specialized inline functions. */
+
+void
+row_set_rec_sys_field(
+/*==================*/
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint val) /* in: value to set */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ field = rec_get_nth_field(rec, pos, &len);
+
+ if (type == DATA_TRX_ID) {
+
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/*********************************************************************
+When an insert to a table is performed, this function builds the entry which
+has to be inserted to an index on the table. */
+
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ /* out: index entry which should be inserted */
+ dtuple_t* row, /* in: row which should be inserted to the
+ table */
+ dict_index_t* index, /* in: index on the table */
+ mem_heap_t* heap) /* in: memory heap from which the memory for
+ the index entry is allocated */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ dict_field_t* ind_field;
+ dfield_t* dfield;
+ dfield_t* dfield2;
+ dict_col_t* col;
+ ulint i;
+
+ ut_ad(row && index && heap);
+ ut_ad(dtuple_check_typed(row));
+
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ if (index->type & DICT_UNIVERSAL) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ } else {
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ }
+
+ for (i = 0; i < entry_len; i++) {
+ ind_field = dict_index_get_nth_field(index, i);
+ col = ind_field->col;
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_copy(dfield, dfield2);
+ dfield->col_no = dict_col_get_no(col);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
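+
+/* A minimal usage sketch (illustrative only; the actual insert step is
+elided): when a row is inserted, an entry is built in turn for each
+index of the table, roughly as in
+
+	index = dict_table_get_first_index(table);
+
+	while (index != NULL) {
+		entry = row_build_index_entry(row, index, heap);
+
+		(... insert entry into index ...)
+
+		index = dict_table_get_next_index(index);
+	}
+*/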
+
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index. */
+
+dtuple_t*
+row_build(
+/*======*/
+ /* out, own: row built; see the NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page, and thus is
+ more efficient */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec, /* in: record in the clustered index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ ulint n_fields;
+ ulint i;
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint row_len;
+ dict_col_t* col;
+ byte* buf;
+
+ ut_ad(index && rec && heap);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_get_size(rec));
+ rec = rec_copy(buf, rec);
+ }
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ row = dtuple_create(heap, row_len);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(rec));
+
+ n_fields = dict_index_get_n_fields(index);
+
+ ut_ad(n_fields == rec_get_n_fields(rec));
+
+ dict_table_copy_types(row, table);
+
+ for (i = 0; i < n_fields; i++) {
+
+ col = dict_field_get_col(dict_index_get_nth_field(index, i));
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ field = rec_get_nth_field(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(row));
+
+ return(row);
+}
+
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, into a tuple provided by the caller. */
+
+void
+row_build_to_tuple(
+/*===============*/
+ dtuple_t* row, /* in/out: row built; see the NOTE below! */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec) /* in: record in the clustered index;
+ NOTE: the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+{
+ dict_table_t* table;
+ ulint n_fields;
+ ulint i;
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint row_len;
+ dict_col_t* col;
+
+ ut_ad(index && rec);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(rec));
+
+ n_fields = dict_index_get_n_fields(index);
+
+ ut_ad(n_fields == rec_get_n_fields(rec));
+
+ dict_table_copy_types(row, table);
+
+ for (i = 0; i < n_fields; i++) {
+
+ col = dict_field_get_col(dict_index_get_nth_field(index, i));
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ field = rec_get_nth_field(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(row));
+}
+
+/***********************************************************************
+Converts an index record to a typed data tuple. */
+
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ /* out, own: index entry built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the dtuple is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ ulint i;
+ byte* field;
+ ulint len;
+ ulint rec_len;
+ byte* buf;
+
+ ut_ad(rec && heap && index);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_get_size(rec));
+ rec = rec_copy(buf, rec);
+ }
+
+ rec_len = rec_get_n_fields(rec);
+
+ entry = dtuple_create(heap, rec_len);
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(rec_len == dict_index_get_n_fields(index));
+
+ dict_index_copy_types(entry, index, rec_len);
+
+ dtuple_set_info_bits(entry, rec_get_info_bits(rec));
+
+ for (i = 0; i < rec_len; i++) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+ field = rec_get_nth_field(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ /* out, own: row reference built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dict_col_t* col;
+ dtuple_t* ref;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint i;
+
+ ut_ad(index && rec && heap);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = mem_heap_alloc(heap, rec_get_size(rec));
+
+ rec = rec_copy(buf, rec);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ col = dict_field_get_col(
+ dict_index_get_nth_field(clust_index, i));
+ pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col));
+
+ if (pos != ULINT_UNDEFINED) {
+ field = rec_get_nth_field(rec, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+ } else {
+ ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER);
+ ut_ad(i == table->mix_len);
+
+ dfield_set_data(dfield,
+ mem_heap_alloc(heap, table->mix_id_len),
+ table->mix_id_len);
+ ut_memcpy(dfield_get_data(dfield), table->mix_id_buf,
+ table->mix_id_len);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+
+ return(ref);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! */
+ dict_index_t* index, /* in: index */
+ rec_t* rec) /* in: record in the index;
+ NOTE: the data fields in ref will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dict_col_t* col;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint i;
+
+ ut_ad(ref && index && rec);
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ col = dict_field_get_col(
+ dict_index_get_nth_field(clust_index, i));
+ pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col));
+
+ if (pos != ULINT_UNDEFINED) {
+ field = rec_get_nth_field(rec, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+ } else {
+ ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER);
+ ut_ad(i == table->mix_len);
+ ut_a(0);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+}
+
+/***********************************************************************
+Builds from a row a row reference with which we can search the clustered
+index record. */
+
+void
+row_build_row_ref_from_row(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! ref must have the right number
+ of fields! */
+ dict_table_t* table, /* in: table */
+ dtuple_t* row) /* in: row
+ NOTE: the data fields in ref will point
+ directly into data of this row */
+{
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dfield_t* dfield2;
+ dict_col_t* col;
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(ref && table && row);
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ col = dict_field_get_col(
+ dict_index_get_nth_field(clust_index, i));
+
+ dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_copy(dfield, dfield2);
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+}
+
+/*******************************************************************
+Searches for the clustered index record of a row, given the row reference. */
+
+ibool
+row_search_on_row_ref(
+/*==================*/
+ /* out: TRUE if found */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ dict_table_t* table, /* in: table */
+ dtuple_t* ref, /* in: row reference */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+ page_t* page;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+ page = buf_frame_align(rec);
+
+ if (rec == page_get_infimum_rec(page)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
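+
+/* Note: the cursor above is opened in the PAGE_CUR_LE mode, so the row
+exists exactly when every field of ref matches the record under the
+cursor, i.e., when low_match == dtuple_get_n_fields(ref); a smaller
+low_match means the cursor stopped on some preceding record and the row
+is not in the index. */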
+
+/*************************************************************************
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved. */
+
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ /* out: record or NULL, if no record found */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index, /* in: secondary index */
+ dict_index_t** clust_index,/* out: clustered index */
+ mtr_t* mtr) /* in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad((index->type & DICT_CLUSTERED) == 0);
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = btr_pcur_get_rec(&pcur);
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ if (!found) {
+
+ return(NULL);
+ }
+
+ return(clust_rec);
+}
+
+/*******************************************************************
+Searches an index record. */
+
+ibool
+row_search_index_entry(
+/*===================*/
+ /* out: TRUE if found */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ page_t* page;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+ page = buf_frame_align(rec);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ if (rec == page_get_infimum_rec(page)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != n_fields) {
+ /* Not found */
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
new file mode 100644
index 00000000000..bd7af5743d8
--- /dev/null
+++ b/innobase/row/row0sel.c
@@ -0,0 +1,2732 @@
+/*******************************************************
+Select
+
+(c) 1997 Innobase Oy
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
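+
+/* These flags are returned by row_sel_try_search_shortcut: SEL_FOUND
+means that a qualifying row was fetched, SEL_EXHAUSTED that the unique
+search can match no row, and SEL_RETRY that the shortcut could not be
+used and the normal search path must be taken. */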
+
+/*************************************************************************
+Creates a select node struct. */
+
+sel_node_t*
+sel_node_create(
+/*============*/
+ /* out, own: select node struct */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(sel_node_t));
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->select_will_do_update = FALSE;
+ node->latch_mode = BTR_SEARCH_LEAF;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*************************************************************************
+Frees the memory private to a select node when a query graph is freed;
+it does not free the heap where the node itself was created. */
+
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /* in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*************************************************************************
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /* in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*************************************************************************
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /* in: first variable in a list of variables */
+ sel_node_t* node) /* in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ exp = node->select_list;
+
+ while (var) {
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ var = que_node_get_next(var);
+ }
+}
+
+/*************************************************************************
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /* in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ func_node = node->select_list;
+
+ while (func_node) {
+ eval_node_set_int_val(func_node, 0);
+
+ func_node = que_node_get_next(func_node);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*************************************************************************
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /* in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*************************************************************************
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /* in: record index */
+ rec_t* rec, /* in: record in a clustered or non-clustered
+ index */
+ sym_node_t* column) /* in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ byte* data;
+ ulint len;
+
+ if (index->type & DICT_CLUSTERED) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ data = rec_get_nth_field(rec, field_no, &len);
+
+ if (column->copy_val) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Allocates a prefetch buffer for a column when prefetch is done for the
+first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /* in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+ * sizeof(sel_buf_t));
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*************************************************************************
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ mem_free(sel_buf->data);
+ }
+ }
+}
+
+/*************************************************************************
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them in the val fields of the column nodes. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+ plan_t* plan) /* in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+#ifdef UNIV_DEBUG
+ dfield_set_data(val, NULL, 0);
+#endif
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = dfield_get_data(val);
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*************************************************************************
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+ plan_t* plan) /* in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ if (!column->copy_val) {
+			/* It makes no sense to push pointers to database
+			page fields when we do not keep a latch on the page! */
+
+ goto next_col;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = dfield_get_data(val);
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
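+
+/* Note on the pointer swap used above and in sel_pop_prefetched_row:
+the column value and the prefetch slot exchange their data pointers and
+buffer sizes instead of copying data, so each dynamically allocated
+buffer stays reachable from exactly one of the two places and can later
+be freed by sel_col_prefetch_buf_free; no memory is leaked. */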
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read. */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+ /* out: DB_SUCCESS or error code */
+ read_view_t* read_view, /* in: read view */
+ plan_t* plan, /* in: plan node for table */
+ rec_t* rec, /* in: record in a clustered index */
+ rec_t** old_vers, /* out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ if (plan->old_vers_heap) {
+ mem_heap_empty(plan->old_vers_heap);
+ } else {
+ plan->old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(rec, mtr, plan->index,
+ read_view, plan->old_vers_heap,
+ old_vers);
+ return(err);
+}
+
+/*************************************************************************
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted. */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ /* out: TRUE if row passed the tests */
+ plan_t* plan) /* in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(cond->args);
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Tests the other conditions. */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ /* out: TRUE if row passed the tests */
+ plan_t* plan) /* in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+ /* out: DB_SUCCESS or error code */
+ sel_node_t* node, /* in: select_node */
+ plan_t* plan, /* in: plan node for table */
+ rec_t* rec, /* in: record in a non-clustered index */
+ que_thr_t* thr, /* in: query thread */
+ rec_t** out_rec,/* out: clustered record or an old version of
+ it, NULL if the old version did not exist
+			in the read view, i.e., it was a freshly
+ inserted version */
+ mtr_t* mtr) /* in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ node->latch_mode, &(plan->clust_pcur),
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ ut_ad(page_rec_is_user_rec(clust_rec));
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+
+ err = lock_clust_rec_read_check_and_lock(0, clust_rec, index,
+ node->row_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(node->read_view, plan,
+ clust_rec, &old_vers, mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ *out_rec = clust_rec;
+
+ return(DB_SUCCESS);
+ }
+ }
+ }
+
+ /* Fetch the columns needed in test conditions */
+
+ row_sel_fetch_columns(index, clust_rec,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************************
+Sets a lock on a record. */
+UNIV_INLINE
+ulint
+sel_set_rec_lock(
+/*=============*/
+ /* out: DB_SUCCESS or error code */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: index */
+ ulint mode, /* in: lock mode */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ if (index->type & DICT_CLUSTERED) {
+ err = lock_clust_rec_read_check_and_lock(0, rec, index, mode,
+ thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(0, rec, index, mode,
+ thr);
+ }
+
+ return(err);
+}
+
+/*************************************************************************
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ sel_node_t* node, /* in: select node */
+ plan_t* plan, /* in: table plan */
+ ibool search_latch_locked,
+ /* in: TRUE if the thread currently
+ has the search latch locked in
+ s-mode */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
+ ulint i;
+
+ if (search_latch_locked) {
+ has_search_latch = RW_S_LATCH;
+ }
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ node->latch_mode, &(plan->pcur),
+ has_search_latch, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
+ &(plan->pcur), FALSE, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*************************************************************************
+Restores a stored pcur position to a table index. */
+UNIV_INLINE
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ /* out: TRUE if the cursor should be moved to
+ the next record after we return from this
+ function (moved to the previous, in the case
+ of a descending cursor) without processing
+ again the current cursor record */
+ sel_node_t* node, /* in: select node */
+ plan_t* plan, /* in: table plan */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(node->latch_mode,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER);
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /* in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+/*************************************************************************
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+ sel_node_t* node, /* in: select node for a consistent read */
+ plan_t* plan, /* in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_index_t* index;
+ rec_t* rec;
+
+ index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+
+ row_sel_open_pcur(node, plan, TRUE, mtr);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (index->type & DICT_CLUSTERED) {
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+ return(SEL_RETRY);
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) {
+
+ return(SEL_RETRY);
+ }
+
+ /* Test deleted flag. Fetch the columns needed in test conditions. */
+
+ row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns));
+
+ if (rec_get_deleted_flag(rec)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+ plan->n_rows_fetched++;
+
+ return(SEL_FOUND);
+}
+
+/*************************************************************************
+Performs a select step. */
+static
+ulint
+row_sel(
+/*====*/
+ /* out: DB_SUCCESS or error code */
+ sel_node_t* node, /* in: select node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool search_latch_locked;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool leaf_contains_updates = FALSE;
+ /* TRUE if select_will_do_update is
+ TRUE and the current clustered index
+ leaf page has been updated during
+ the current mtr: mtr must be committed
+ at the same time as the leaf x-latch
+ is released */
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ ulint found_flag;
+ ulint err;
+
+ ut_ad(thr->run_node == node);
+
+ search_latch_locked = FALSE;
+
+ if (node->read_view) {
+		/* In consistent reads, we try to make do with the hash index
+		and to avoid the buffer page get: this is to reduce the memory
+		bus load resulting from semaphore operations. The search latch
+ will be s-locked when we access an index with a unique search
+ condition, but not locked when we access an index with a
+ less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(leaf_contains_updates == FALSE);
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+		/* The cursor has already reached the result set end: no more
+		rows to process for this table cursor, and the prefetch
+		stack was empty as well */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr_start(&mtr);
+
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust) {
+ if (!search_latch_locked) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+ } else if (btr_search_latch.writer_is_wait_ex) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+ kept for some 10 searches before being released,
+ a waiting x-latch request would block other threads
+ from acquiring an s-latch for a long time, lowering
+ performance significantly in multiprocessors. */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+ found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+ if (found_flag == SEL_FOUND) {
+
+ goto next_table;
+
+ } else if (found_flag == SEL_EXHAUSTED) {
+
+ goto table_exhausted;
+ }
+
+ ut_ad(found_flag == SEL_RETRY);
+
+ plan_reset_cursor(plan);
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ search_latch_locked = FALSE;
+ }
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+
+ row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && (rec != page_get_supremum_rec(buf_frame_align(rec)))) {
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+ err = sel_set_rec_lock(page_rec_get_next(rec), index,
+ node->row_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+		processed yet another page of the index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ would store the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+	/* PHASE 2: Check the mix id of a mixed index record, if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+		tells how many fields in the user record matched the search
+		tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ } else if (plan->mixed_index) {
+ /* We have to check if the record in a mixed cluster belongs
+ to this table */
+
+ if (!dict_is_mixed_table_rec(plan->table, rec)) {
+
+ goto next_rec;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (index->type & DICT_CLUSTERED) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(node->read_view,
+ plan, rec, &old_vers,
+ &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ row_sel_fetch_columns(index, rec,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions */
+
+ row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+		/* It was a non-clustered index and we must also fetch the
+		clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec)) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+ if (node->select_will_do_update) {
+ /* This is a searched update and we can do the update in-place,
+ saving CPU time */
+
+ row_upd_in_place_in_select(node, thr, &mtr);
+
+ leaf_contains_updates = TRUE;
+
+ /* When the database is in the online backup mode, the number
+ of log records for a single mtr should be small: increment the
+ cost counter to ensure it */
+
+ cost_counter += 1 + (SEL_COST_LIMIT / 8);
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_push_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ ut_ad(!search_latch_locked);
+
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (leaf_contains_updates
+ && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) {
+
+ /* We must commit &mtr if we are moving to a different page,
+ because we have done updates to the x-latched leaf page, and
+ the latch would be released in btr_pcur_move_to_next, without
+ &mtr getting committed there */
+
+ ut_ad(node->asc);
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ return(DB_SUCCESS);
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr would not be committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ ut_ad(!search_latch_locked);
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr)
+ || !node->asc);
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ return(err);
+}
+
+/**************************************************************************
+Performs a select step. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_sel_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint i_lock_mode;
+ sym_node_t* table_node;
+ sel_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or when execution
+	resumes after a wait for a table intention lock), set intention locks
+ on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ /* Assign a read view for the query */
+ node->read_view = trx_assign_read_view(
+ thr_get_trx(thr));
+ } else {
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ table_node = node->table_list;
+
+ while (table_node) {
+ err = lock_table(0, table_node->table,
+ i_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+
+ que_thr_handle_error(thr, DB_ERROR,
+ NULL, 0);
+ return(NULL);
+ }
+
+ table_node = que_node_get_next(table_node);
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor &&
+ UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+ printf("SQL error %lu\n", err);
+
+ que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+ return(NULL);
+ }
+
+ return(thr);
+}
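+
+/* NOTE (illustrative summary, not part of the original code): the
+sel_node states used above form a small state machine:
+
+	SEL_NODE_OPEN  --row_sel_step: locks or read view assigned-->
+	SEL_NODE_FETCH --row_sel: first table exhausted-->
+	SEL_NODE_NO_MORE_ROWS
+
+fetch_step below treats a SEL_NODE_CLOSED cursor as an SQL error, and
+row_printf_step resets the state to SEL_NODE_OPEN when the node is
+entered from its parent. */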
+
+/**************************************************************************
+Performs a fetch for a cursor. */
+
+que_thr_t*
+fetch_step(
+/*=======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ sel_assign_into_var_values(node->into_list, sel_node);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+	/* Make the fetch node the parent of the cursor definition for
+	the time of the fetch, so that execution knows to return to this
+	fetch node after a row has been selected, or when it is known
+	that no rows are left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ /* SQL error detected */
+ printf("SQL error %lu\n", DB_ERROR);
+
+ que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/***************************************************************
+Prints a row in a select result. */
+
+que_thr_t*
+row_printf_step(
+/*============*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ printf(" ::: ");
+
+ arg = que_node_get_next(arg);
+ }
+
+ printf("\n");
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/********************************************************************
+Converts a key value stored in MySQL format to an Innobase dtuple.
+The last field of the key value may be just a prefix of a fixed length
+field: hence the parameter key_len. */
+
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /* in: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /* in: buffer to use in field
+ conversions */
+ dict_index_t* index, /* in: index of the key value */
+ byte* key_ptr, /* in: MySQL key value */
+ ulint key_len) /* in: MySQL key value length */
+{
+ dfield_t* dfield;
+ ulint offset;
+ ulint len;
+ byte* key_end;
+ ulint n_fields = 0;
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ if (dfield_get_type(dfield)->mtype == DATA_SYS) {
+ /* A special case: we are looking for a position in a
+ generated clustered index: the first and the only
+ ordering column is ROW_ID */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+ offset = 0;
+ len = dfield_get_type(dfield)->len;
+
+ n_fields++;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+ goto next_part;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield, buf, key_ptr + offset, len,
+ dfield_get_type(dfield)->mtype,
+ dfield_get_type(dfield)->prtype
+ & DATA_UNSIGNED);
+ next_part:
+ key_ptr += (offset + len);
+
+ if (key_ptr > key_end) {
+ /* The last field in key was not a complete
+ field but a prefix of it */
+
+ ut_ad(dfield_get_len(dfield) != UNIV_SQL_NULL);
+
+ dfield_set_data(dfield, buf,
+ len - (ulint)(key_ptr - key_end));
+ }
+
+ buf += len;
+
+ dfield++;
+ }
+
+ /* We set the length of tuple to n_fields: we assume that
+ the memory area allocated for it is big enough (usually
+ bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
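+
+/* EXAMPLE (illustrative sketch, not part of the original code): the
+loop above assumes the following MySQL key value layout for each key
+part: for a nullable column there is one leading flag byte, nonzero
+for SQL NULL, followed by the fixed column length of data bytes (the
+data bytes are present, but garbage, when the value is NULL); for a
+NOT NULL column the flag byte is absent. In outline, using the
+variables of the loop:
+
+	if the column is nullable:
+		offset = 1
+		if *key_ptr != 0, the key part is SQL NULL
+	else:
+		offset = 0
+	the value is the len bytes at key_ptr + offset, in MySQL format
+	key_ptr += offset + len
+*/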
+
+/******************************************************************
+Stores the row id to the prebuilt struct. */
+UNIV_INLINE
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt */
+ rec_t* index_rec, /* in: record */
+ dict_index_t* index) /* in: index of the record */
+{
+ byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(index_rec,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+
+ ut_a(len == DATA_ROW_ID_LEN);
+
+ ut_memcpy(prebuilt->row_id, data, len);
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field in the MySQL format. */
+UNIV_INLINE
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+ byte* dest, /* in/out: buffer where to store; NOTE that BLOBs
+ are not in themselves stored here: the caller must
+ allocate and copy the BLOB into buffer before, and pass
+ the pointer to the BLOB in 'data' */
+ ulint col_len,/* in: MySQL column length */
+ byte* data, /* in: data to store */
+ ulint len, /* in: length of the data */
+ ulint type, /* in: data type */
+ ulint is_unsigned)/* in: != 0 if an unsigned integer type */
+{
+ byte* ptr;
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ if (type == DATA_INT) {
+ /* Convert integer data from Innobase to a little-endian
+ format, sign bit restored to normal */
+
+ ptr = dest + len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == dest) {
+ break;
+ }
+ data++;
+ }
+
+ if (!is_unsigned) {
+ dest[len - 1] = dest[len - 1] ^ 128;
+ }
+
+ ut_ad(col_len == len);
+ } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL
+ || type == DATA_BINARY) {
+ /* Store the length of the data to the first two bytes of
+ dest; does not do anything yet because MySQL has
+ no real vars! */
+
+ dest = row_mysql_store_var_len(dest, len);
+ ut_memcpy(dest, data, len);
+
+ /* ut_ad(col_len >= len + 2); No real var implemented in
+ MySQL yet! */
+
+ } else if (type == DATA_BLOB) {
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, col_len, data, len);
+ } else {
+ ut_memcpy(dest, data, len);
+ ut_ad(col_len == len);
+ }
+}
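+
+#if 0
+/* EXAMPLE: not part of the original code. A self-contained sketch of
+the DATA_INT branch above, for an assumed 4-byte signed column.
+Innobase stores integers big-endian with the sign bit inverted, so
+that an unsigned byte comparison orders the values numerically; MySQL
+expects little-endian bytes with a normal two's complement sign bit.
+The function name and the C99 fixed-width types are illustrative
+only. */
+#include <stdint.h>
+
+static int32_t
+innodb_int4_to_native(const unsigned char* data)
+{
+	uint32_t	v;
+
+	/* Assemble the big-endian stored bytes */
+	v = ((uint32_t) data[0] << 24)
+		| ((uint32_t) data[1] << 16)
+		| ((uint32_t) data[2] << 8)
+		| (uint32_t) data[3];
+
+	/* Restore the sign bit; for an unsigned column this xor must
+	be omitted, just as the is_unsigned test above skips it */
+	v ^= 0x80000000;
+
+	return((int32_t) v);
+}
+#endif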
+
+/******************************************************************
+Converts a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only some of
+the columns to mysql_rec: the other columns are left blank, as not all
+columns may be needed in the query. */
+static
+void
+row_sel_store_mysql_rec(
+/*====================*/
+ byte* mysql_rec, /* out: row in the MySQL format */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec) /* in: Innobase record in the index
+ which was described in prebuilt's
+ template */
+{
+ mysql_row_templ_t* templ;
+ byte* data;
+ ulint len;
+ byte* blob_buf;
+ ulint i;
+
+ ut_ad(prebuilt->mysql_template);
+
+ if (prebuilt->blob_heap != NULL) {
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ }
+
+ /* Mark all columns as not SQL NULL */
+
+ memset(mysql_rec, '\0', prebuilt->null_bitmap_len);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ data = rec_get_nth_field(rec, templ->rec_field_no, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (templ->type == DATA_BLOB) {
+
+ /* Copy the BLOB data to the BLOB
+ heap of prebuilt */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap =
+ mem_heap_create(len);
+ }
+
+ blob_buf = mem_heap_alloc(prebuilt->blob_heap,
+ len);
+ ut_memcpy(blob_buf, data, len);
+
+ data = blob_buf;
+ }
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len, data, len,
+ templ->type, templ->is_unsigned);
+ } else {
+ mysql_rec[templ->mysql_null_byte_offset] |=
+ (byte) (templ->mysql_null_bit_mask);
+ }
+ }
+}
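+
+/* EXAMPLE (illustrative, not part of the original code): after the
+loop above, the SQL NULL flag of a column can be read back from the
+row buffer using the same template fields:
+
+	if (mysql_rec[templ->mysql_null_byte_offset]
+			& (byte) templ->mysql_null_bit_mask) {
+		... the column is SQL NULL in this row ...
+	}
+*/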
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ read_view_t* read_view, /* in: read view */
+ dict_index_t* clust_index, /* in: clustered index */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec, /* in: record in a clustered index */
+ rec_t** old_vers, /* out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(rec, mtr, clust_index,
+ read_view, prebuilt->old_vers_heap,
+ old_vers);
+ return(err);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface. */
+static
+ulint
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+ /* out: DB_SUCCESS or error code */
+ row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/* in: secondary index where rec resides */
+ rec_t* rec, /* in: record in a non-clustered index */
+ que_thr_t* thr, /* in: query thread */
+ rec_t** out_rec,/* out: clustered record or an old version of
+ it, NULL if the old version did not exist
+				in the read view, i.e., it was a freshly
+				inserted version */
+ mtr_t* mtr) /* in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ ut_ad(page_rec_is_user_rec(clust_rec));
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record */
+
+ err = lock_clust_rec_read_check_and_lock(0, clust_rec,
+ clust_index,
+ prebuilt->select_lock_type, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ trx = thr_get_trx(thr);
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index,
+ trx->read_view)) {
+
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, clust_rec,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ clust_rec = old_vers;
+ }
+ }
+
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type == LOCK_X) {
+ /* We may use the cursor in update: store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/************************************************************************
+Restores cursor position after it has been stored. We have to take into
+account that the record on which the cursor was positioned may have been
+deleted. Then we may have to move the cursor one step up or down. */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+ /* out: TRUE if we may need to
+ process the record the cursor is
+ now positioned on (i.e. we should
+ not go to the next record yet) */
+ ulint latch_mode, /* in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /* in: cursor whose position
+ has been stored */
+ ibool moves_up, /* in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /* in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+ ulint relative_position;
+
+ relative_position = pcur->rel_pos;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ if (relative_position == BTR_PCUR_ON) {
+ if (success) {
+ return(FALSE);
+ }
+
+ if (moves_up) {
+ btr_pcur_move_to_next(pcur, mtr);
+
+ return(TRUE);
+ }
+
+ return(TRUE);
+ }
+
+ if (relative_position == BTR_PCUR_AFTER) {
+ if (moves_up) {
+ return(TRUE);
+ }
+
+ if (btr_pcur_is_on_user_rec(pcur, mtr)) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_BEFORE);
+
+ if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+}
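+
+/* Summary of the cases above (illustrative, not part of the original
+code); TRUE means that the caller may still need to process the record
+the cursor is now positioned on:
+
+	stored rel_pos	restored?	moves_up	action		returns
+	BTR_PCUR_ON	yes		any		none		FALSE
+	BTR_PCUR_ON	no		TRUE		move to next	TRUE
+	BTR_PCUR_ON	no		FALSE		none		TRUE
+	BTR_PCUR_AFTER	any		TRUE		none		TRUE
+	BTR_PCUR_AFTER	any		FALSE		move to prev*	TRUE
+	BTR_PCUR_BEFORE	any		TRUE		move to next*	TRUE
+	BTR_PCUR_BEFORE	any		FALSE		none		TRUE
+
+The moves marked with * are only made if the cursor is positioned on
+a user record. */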
+
+/************************************************************************
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+ byte* buf, /* in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct */
+{
+ ut_ad(prebuilt->n_fetch_cached > 0);
+
+ ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first],
+ prebuilt->mysql_row_len);
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/************************************************************************
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec) /* in: record to push */
+{
+ ulint i;
+
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ prebuilt->fetch_cache[i] = mem_alloc(
+ prebuilt->mysql_row_len);
+ }
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+
+ row_sel_store_mysql_rec(
+ prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt, rec);
+
+ prebuilt->n_fetch_cached++;
+}
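+
+#if 0
+/* EXAMPLE: not part of the original code. The fetch cache above is a
+plain FIFO of at most MYSQL_FETCH_CACHE_SIZE rows already converted to
+the MySQL row format. The same push/pop discipline in isolation, with
+hypothetical names and fixed illustrative sizes: */
+#include <string.h>
+#include <assert.h>
+
+#define EX_CACHE_SIZE	8	/* assumed capacity */
+#define EX_ROW_LEN	64	/* assumed row length in bytes */
+
+typedef struct {
+	unsigned char	rows[EX_CACHE_SIZE][EX_ROW_LEN];
+	int		n_cached;	/* number of rows in the cache */
+	int		first;		/* index of the next row to pop */
+} ex_fetch_cache_t;
+
+static void
+ex_cache_push(ex_fetch_cache_t* c, const unsigned char* row)
+{
+	/* Pushing is only done while nothing has been popped yet:
+	the cache is filled completely before it is drained */
+	assert(c->first == 0 && c->n_cached < EX_CACHE_SIZE);
+
+	memcpy(c->rows[c->n_cached], row, EX_ROW_LEN);
+	c->n_cached++;
+}
+
+static void
+ex_cache_pop(ex_fetch_cache_t* c, unsigned char* buf)
+{
+	assert(c->n_cached > 0);
+
+	memcpy(buf, c->rows[c->first], EX_ROW_LEN);
+	c->n_cached--;
+	c->first++;
+
+	if (c->n_cached == 0) {
+		c->first = 0;	/* reset once the cache is drained */
+	}
+}
+#endif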
+
+/************************************************************************
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be attempted on the
+cursor! */
+
+ulint
+row_search_for_mysql(
+/*=================*/
+ /* out: DB_SUCCESS,
+ DB_RECORD_NOT_FOUND,
+ DB_END_OF_INDEX, or DB_DEADLOCK */
+ byte* buf, /* in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /* in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /* in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /* in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+					with a stored position! When opening
+					a cursor, 'direction' should be 0. */
+{
+ dict_index_t* index = prebuilt->index;
+ dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ rec_t* rec;
+ rec_t* index_rec;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ ibool moved;
+ ibool cons_read_requires_clust_rec;
+ ibool was_lock_wait;
+ ulint ret;
+ ibool unique_search_from_clust_index = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ mtr_t mtr;
+
+ ut_ad(index && pcur && search_tuple);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ ut_ad(sync_thread_levels_empty_gen(FALSE));
+
+ if (direction == 0) {
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (direction != prebuilt->fetch_direction) {
+ if (prebuilt->n_fetch_cached > 0) {
+ ut_a(0);
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+
+ return(DB_SUCCESS);
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+			/* The previously returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+
+ return(DB_RECORD_NOT_FOUND);
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+			/* Prevent wrap-around */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
+ mode = pcur->search_mode;
+ }
+
+ if (match_mode == ROW_SEL_EXACT && index->type & DICT_UNIQUE
+ && index->type & DICT_CLUSTERED
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)) {
+
+ if (direction == ROW_SEL_NEXT) {
+ /* MySQL sometimes seems to do fetch next even
+ if the search condition is unique; we do not store
+ pcur position in this case, so we cannot
+ restore cursor position, and must return
+ immediately */
+
+ return(DB_RECORD_NOT_FOUND);
+ }
+
+ ut_a(direction == 0); /* We cannot do fetch prev, as we have
+ not stored the cursor position */
+ mode = PAGE_CUR_GE;
+
+ unique_search_from_clust_index = TRUE;
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (direction == 0) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ mtr_start(&mtr);
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (direction != 0) {
+ moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ if (!moved) {
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+ btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+ } else {
+ if (mode == PAGE_CUR_G) {
+ btr_pcur_open_at_index_side(TRUE, index,
+ BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+ } else if (mode == PAGE_CUR_L) {
+ btr_pcur_open_at_index_side(FALSE, index,
+ BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+ }
+ }
+
+ if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+
+ } else if (prebuilt->select_lock_type == LOCK_NONE) {
+ /* This is a consistent read */
+ trx_start_if_not_started(trx);
+
+ /* Assign a read view for the query */
+
+ trx_assign_read_view(trx);
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ trx_start_if_not_started(trx);
+
+ if (prebuilt->select_lock_type == LOCK_S) {
+ err = lock_table(0, index->table, LOCK_IS, thr);
+ } else {
+ err = lock_table(0, index->table, LOCK_IX, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ prebuilt->sql_stat_start = FALSE;
+ }
+
+ /*-------------------------------------------------------------*/
+rec_loop:
+ cons_read_requires_clust_rec = FALSE;
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type,
+ thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (unique_search_from_clust_index && btr_pcur_get_up_match(pcur)
+ == dtuple_get_n_fields(search_tuple)) {
+ /* The record matches enough */
+
+ ut_ad(mode == PAGE_CUR_GE);
+
+ } else if (match_mode == ROW_SEL_EXACT) {
+		/* Test if the index record completely matches search_tuple
+		in prebuilt: if not, we return DB_RECORD_NOT_FOUND */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec)) {
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ ret = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) {
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ ret = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* Get the right version of the row in a consistent read */
+
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ cons_read_requires_clust_rec = FALSE;
+
+ if (index == clust_index) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ trx->read_view)) {
+
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, rec,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index,
+ trx->read_view)) {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ goto next_rec;
+ }
+
+ /* Get the clustered index record if needed and if we did
+ not do the search using the clustered index */
+
+ index_rec = rec;
+
+ if (index != clust_index && (cons_read_requires_clust_rec
+ || prebuilt->need_to_access_clustered)) {
+
+		/* It was a non-clustered index and we must also fetch the
+		clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec)) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ rec = clust_rec;
+ }
+
+ /* We found a qualifying row */
+
+ if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD
+ && !prebuilt->templ_contains_blob
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->clust_index_was_generated) {
+
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+		update: that is why we require select_lock_type == LOCK_NONE */
+
+ row_sel_push_cache_row_for_mysql(prebuilt, rec);
+
+ if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
+
+ goto got_row;
+ }
+
+ goto next_rec;
+ } else {
+ row_sel_store_mysql_rec(buf, prebuilt, rec);
+
+ if (prebuilt->clust_index_was_generated) {
+ row_sel_store_row_id_to_prebuilt(prebuilt, index_rec,
+ index);
+ }
+ }
+got_row:
+ /* TODO: should we in every case store the cursor position, even
+ if this is just a join, for example? */
+
+ if (!unique_search_from_clust_index
+ || prebuilt->select_lock_type == LOCK_X) {
+
+ /* Inside an update always store the cursor position */
+
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
+ ret = DB_SUCCESS;
+
+ goto normal_return;
+ /*-------------------------------------------------------------*/
+next_rec:
+ if (mtr_has_extra_clust_latch) {
+ /* We must commit mtr if we are moving to the next
+ non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr_start(&mtr);
+ moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ if (moved) {
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ moved = btr_pcur_move_to_next(pcur, &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(pcur, &mtr);
+ }
+
+ if (!moved) {
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (match_mode != 0) {
+ ret = DB_RECORD_NOT_FOUND;
+ } else {
+ ret = DB_END_OF_INDEX;
+ }
+
+ goto normal_return;
+ }
+
+ goto rec_loop;
+ /*-------------------------------------------------------------*/
+lock_wait_or_error:
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ mtr_start(&mtr);
+
+ sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ return(err);
+
+normal_return:
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ mtr_commit(&mtr);
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ ret = DB_SUCCESS;
+ }
+
+ return(ret);
+}
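+
+/* EXAMPLE (illustrative, not part of the original code): the calling
+convention described above, for a full index scan with a search tuple
+of 0 fields; error handling omitted. The cursor is opened with
+direction == 0, after which each following row is fetched with
+ROW_SEL_NEXT:
+
+	err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+
+	while (err == DB_SUCCESS) {
+		... process the row in buf ...
+
+		err = row_search_for_mysql(buf, 0, prebuilt, 0,
+							ROW_SEL_NEXT);
+	}
+
+	err == DB_END_OF_INDEX then signals a normal end of the scan
+
+On the fetch calls the mode parameter is ignored: it is overwritten
+from pcur->search_mode, as is done at the start of the function. */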
diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c
new file mode 100644
index 00000000000..68115895dbb
--- /dev/null
+++ b/innobase/row/row0uins.c
@@ -0,0 +1,308 @@
+/******************************************************
+Fresh insert undo
+
+(c) 1996 Innobase Oy
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*******************************************************************
+Removes a clustered index record. The pcur in node was positioned on the
+record, now it is detached. */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+
+ UT_NOT_USED(thr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+ &mtr);
+ ut_a(success);
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ /* Drop the index tree associated with the row in
+ SYS_INDEXES table: */
+
+ dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &(node->pcur), &mtr);
+ ut_a(success);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (success) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+retry:
+	/* If the optimistic delete did not succeed, try a pessimistic
+	descent down the tree */
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+ &(node->pcur), &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(err);
+}
+
+/*******************************************************************
+Removes a secondary index entry if found. */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+ /* out: DB_SUCCESS, DB_FAIL, or
+ DB_OUT_OF_FILE_SPACE */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to remove */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool found;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ UT_NOT_USED(thr);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ /* FIXME: remove printfs in the final version */
+
+ /* printf(
+ "--UNDO INS: Record not found from page %lu index %s\n",
+ buf_frame_get_page_no(btr_cur_get_rec(btr_cur)),
+ index->name); */
+
+ /* ibuf_print(); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/*******************************************************************
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree. */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ dict_index_t* index, /* in: index */
+	dtuple_t*	entry,	/* in: index entry to remove */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ ulint n_tries = 0;
+
+	/* First try an optimistic descent down the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+	/* Then try a pessimistic descent down the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry, thr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
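+
+#if 0
+/* EXAMPLE: not part of the original code. The optimistic/pessimistic
+retry discipline used above, and in row_undo_ins_remove_clust_rec, in
+isolation; the function pointer type and the helper name are
+hypothetical. */
+typedef ulint (*ex_undo_op_t)(ulint mode);
+
+static ulint
+ex_undo_retry(ex_undo_op_t op)
+{
+	ulint	err;
+	ulint	n_tries = 0;
+
+	/* First a cheap attempt confined to a single leaf page */
+	err = op(BTR_MODIFY_LEAF);
+
+	if (err == DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* Fall back to a pessimistic descent which may restructure the
+	tree; retry a bounded number of times, sleeping in between, as
+	the failure may be a temporary lack of file space */
+	for (;;) {
+		err = op(BTR_MODIFY_TREE);
+
+		if (err == DB_SUCCESS
+		    || n_tries >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+			return(err);
+		}
+
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+}
+#endif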
+
+/***************************************************************
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ ulint type;
+ ulint dummy;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, &undo_no,
+ &table_id);
+ ut_ad(type == TRX_UNDO_INSERT_REC);
+ node->rec_type = type;
+
+ /* NOTE that the table has to be explicitly released later */
+ node->table = dict_table_get_on_id(table_id, node->trx);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+}
+
+/***************************************************************
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even a
+delete-marked one, at the time of the insert. */
+
+ulint
+row_undo_ins(
+/*=========*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dtuple_t* entry;
+ ibool found;
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_INSERT);
+
+ row_undo_ins_parse_undo_rec(node, thr);
+
+ found = row_undo_search_clust_to_pcur(node, thr);
+
+ if (!found) {
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ while (node->index != NULL) {
+ entry = row_build_index_entry(node->row, node->index,
+ node->heap);
+ err = row_undo_ins_remove_sec(node->index, entry, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ err = row_undo_ins_remove_clust_rec(node, thr);
+
+ return(err);
+}
diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c
new file mode 100644
index 00000000000..2aa223a6186
--- /dev/null
+++ b/innobase/row/row0umod.c
@@ -0,0 +1,608 @@
+/******************************************************
+Undo modify of a row
+
+(c) 1997 Innobase Oy
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may already have their delete mark set to FALSE, if the delete mark
+operation was stopped midway, or if the undo operation ended prematurely
+because of a system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/***************************************************************
+Checks if also the previous version of the clustered index record was
+modified or inserted by the same transaction, and its undo number is such
+that it should be undone in the same rollback. */
+UNIV_INLINE
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+ /* out: TRUE if also previous modify or
+ insert of this row should be undone */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dulint* undo_no)/* out: the undo number */
+{
+ trx_undo_rec_t* undo_rec;
+ ibool ret;
+ trx_t* trx;
+
+ UT_NOT_USED(thr);
+
+ trx = node->trx;
+
+ if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) {
+
+ return(FALSE);
+ }
+
+ undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap);
+
+ *undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ if (ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0) {
+ ret = TRUE;
+ } else {
+ ret = FALSE;
+ }
+
+ return(ret);
+}
+
+/***************************************************************
+Undoes a modify in a clustered index record. */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code:
+ we may run out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+ ibool do_remove;
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+
+ /* Find out if we can remove the whole clustered index record */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+ do_remove = TRUE;
+ } else {
+ do_remove = FALSE;
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+
+ if (do_remove) {
+ success = btr_cur_optimistic_delete(btr_cur, mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ if (do_remove) {
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ } else {
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a modify in a clustered index record. Also sets the node state
+for the next round of undo. */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+ /* out: DB_SUCCESS or error code: we may run
+ out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ ulint err;
+ ibool success;
+ ibool more_vers;
+ dulint new_undo_no;
+
+ ut_ad(node && thr);
+
+ /* Check if also the previous version of the clustered index record
+ should be undone in this same rollback operation */
+
+ more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no);
+
+ pcur = &(node->pcur);
+
+ mtr_start(&mtr);
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ if (more_vers && err == DB_SUCCESS) {
+
+		/* Reserve the undo log record of the prior version after
+ committing &mtr: this is necessary to comply with the latching
+ order, as &mtr may contain the fsp latch which is lower in
+ the latch hierarchy than trx->undo_mutex. */
+
+ success = trx_undo_rec_reserve(node->trx, new_undo_no);
+
+ if (success) {
+ node->state = UNDO_NODE_PREV_VERS;
+ }
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found. */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ /* out: DB_SUCCESS, DB_FAIL, or
+ DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ ibool found;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ulint err;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ /* FIXME: remove printfs in the final version */
+
+ /* printf(
+ "--UNDO MOD: Record not found from page %lu index %s\n",
+ buf_frame_get_page_no(btr_cur_get_rec(btr_cur)),
+ index->name); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+	/* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some such
+	version requires it, we should delete mark the record instead. */
+
+ mtr_start(&mtr_vers);
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_ad(success);
+
+ old_has = row_vers_old_has_index_entry(FALSE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ if (old_has) {
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found. */
+UNIV_INLINE
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ ulint err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE);
+ return(err);
+}
+
+/***************************************************************
+Delete unmarks a secondary index entry which must be found. */
+static
+void
+row_undo_mod_del_unmark_sec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool found;
+
+ UT_NOT_USED(node);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ ut_a(found);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, FALSE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is UPD_DEL. */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr, index,
+ entry);
+ if (err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is DEL_MARK. */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+ /* out: DB_SUCCESS */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_undo_mod_del_unmark_sec(node, thr, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST. */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ /* out: DB_SUCCESS or error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ /* No change in secondary indexes */
+
+ return(DB_SUCCESS);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field(node->row, node->index,
+ node->update)) {
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr,
+ index, entry);
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row */
+
+ row_upd_index_replace_new_col_vals(entry, index,
+ node->update);
+
+ row_undo_mod_del_unmark_sec(node, thr, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &undo_no, &table_id);
+ node->rec_type = type;
+
+ /* NOTE that the table has to be explicitly released later */
+ node->table = dict_table_get_on_id(table_id, thr_get_trx(thr));
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, node->heap,
+ &(node->update));
+ node->new_roll_ptr = roll_ptr;
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+}
+
+/***************************************************************
+Undoes a modify operation on a row of a table. */
+
+ulint
+row_undo_mod(
+/*=========*/
+ /* out: DB_SUCCESS or error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool found;
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+
+ row_undo_mod_parse_undo_rec(node, thr);
+
+ found = row_undo_search_clust_to_pcur(node, thr);
+
+ if (!found) {
+ /* It is already undone, or will be undone by another query
+ thread */
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ err = row_undo_mod_upd_exist_sec(node, thr);
+
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+ err = row_undo_mod_del_mark_sec(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ err = row_undo_mod_upd_del_sec(node, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_clust(node, thr);
+
+ return(err);
+}
diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c
new file mode 100644
index 00000000000..6dc032f7e13
--- /dev/null
+++ b/innobase/row/row0undo.c
@@ -0,0 +1,313 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was
+not part of the ordering fields in the clustered index? Maybe we would have
+to write it to the undo log. Well, maybe not, because if we order the row id
+and trx id in descending order, then the only undeleted copy is the first in
+the index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither a secondary index field nor a clustered index
+ord field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is probably not a problem.
+(2) If it updates a secondary index ord field but not a clustered ord
+field: then in the secondary index there are delete marked records which
+differ in an ord field. No problem.
+(3) If it updates a clustered ord field but not a secondary one, and the
+secondary index is unique, then the record in the secondary index is just
+updated at the clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
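+
+/* A toy model of the version list described above; it is illustration
+only, and the toy_ structs and functions are invented names, not InnoDB
+code. The clustered index record is the head of the list, and each undo
+log record reached through a roll ptr holds one older version. */
+
+#if 0 /* illustration only: not compiled */
+typedef struct toy_undo_rec_struct toy_undo_rec_t;
+struct toy_undo_rec_struct {
+ toy_undo_rec_t* prev_version; /* next older version, or NULL if the
+ row was a fresh insert */
+};
+
+typedef struct {
+ ibool del_marked; /* delete mark of the newest version */
+ toy_undo_rec_t* roll_ptr; /* head of the undo list, or NULL for
+ a fresh insert */
+} toy_clust_rec_t;
+
+/* In this model, purge may remove the clustered index record when the
+row is delete marked and its roll ptr points to the undo log record the
+purge is currently processing, i.e., the version list has become empty: */
+static
+ibool
+toy_purge_may_remove(
+/*=================*/
+ /* out: TRUE if purge may remove the record */
+ toy_clust_rec_t* rec, /* in: clustered index record */
+ toy_undo_rec_t* purge_pos) /* in: undo record being purged */
+{
+ return(rec->del_marked && rec->roll_ptr == purge_pos);
+}
+#endif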
+
+/************************************************************************
+Creates a row undo node to a query graph. */
+
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ /* out, own: undo node */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx && parent && heap);
+
+ undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***************************************************************
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case. */
+
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ /* out: TRUE if found; NOTE the node->pcur
+ must be closed by the caller, regardless of
+ the return value */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ ibool found;
+ mtr_t mtr;
+ ibool ret;
+ rec_t* rec;
+
+ UT_NOT_USED(thr);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ rec = btr_pcur_get_rec(&(node->pcur));
+
+ if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(rec, clust_index))) {
+
+ /* We must remove the reservation on the undo log record
+ BEFORE releasing the latch on the clustered index page: this
+ is to make sure that some thread will eventually undo the
+ modification corresponding to node->roll_ptr. */
+
+ /* printf("--------------------undoing a previous version\n");
+ */
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ ret = FALSE;
+ } else {
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ node->heap);
+ btr_pcur_store_position(&(node->pcur), &mtr);
+
+ ret = TRUE;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ return(ret);
+}
+
+/***************************************************************
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_undo(
+/*=====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ trx_t* trx;
+ dulint roll_ptr;
+
+ ut_ad(node && thr);
+
+ trx = node->trx;
+
+ if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+ /* The call below also starts an mtr */
+ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+ trx->roll_limit,
+ &roll_ptr,
+ node->heap);
+ if (!node->undo_rec) {
+ /* Rollback completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+
+ } else if (node->state == UNDO_NODE_PREV_VERS) {
+
+ /* Undo should be done to the same clustered index record
+ again in this same rollback, restoring the previous version */
+
+ roll_ptr = node->new_roll_ptr;
+
+ node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+ node->heap);
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+ }
+
+ if (node->state == UNDO_NODE_INSERT) {
+
+ err = row_undo_ins(node, thr);
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ } else {
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+ err = row_undo_mod(node, thr);
+ }
+
+ /* Do some cleanup */
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs. */
+
+que_thr_t*
+row_undo_step(
+/*==========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ undo_node_t* node;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ srv_activity_count++;
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ err = row_undo(node, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+
+ ut_a(0);
+
+ return(NULL);
+ }
+
+ return(thr);
+}
diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c
new file mode 100644
index 00000000000..44843494247
--- /dev/null
+++ b/innobase/row/row0upd.c
@@ -0,0 +1,1394 @@
+/******************************************************
+Update of a row
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only when the
+undo log is purged will the index records be physically deleted from
+the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on a page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
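+
+/* A minimal sketch of the delete mark protocol described above; it is
+illustration only, and the toy_ names are invented, not the real InnoDB
+record or undo log formats. Only the delete bit and the sys fields of the
+record change; the tree structure does not. */
+
+#if 0 /* illustration only: not compiled */
+typedef struct {
+ dulint old_trx_id; /* trx id before the delete */
+ dulint old_roll_ptr; /* roll ptr before the delete */
+} toy_undo_t;
+
+typedef struct {
+ ibool del_marked; /* the delete bit */
+ dulint trx_id; /* id of the last modifying transaction */
+ dulint roll_ptr; /* roll ptr to the undo log */
+} toy_rec_t;
+
+static
+void
+toy_del_mark(
+/*=========*/
+ toy_rec_t* rec, /* in/out: record to delete mark */
+ dulint trx_id, /* in: id of the deleting transaction */
+ dulint roll_ptr, /* in: roll ptr of the new undo record */
+ toy_undo_t* undo) /* out: saved old sys field values */
+{
+ /* Save the old sys fields to the undo log record */
+ undo->old_trx_id = rec->trx_id;
+ undo->old_roll_ptr = rec->roll_ptr;
+
+ /* Set the delete bit and substitute the new sys fields: no
+ physical change occurs in the index tree */
+ rec->del_marked = TRUE;
+ rec->trx_id = trx_id;
+ rec->roll_ptr = roll_ptr;
+}
+#endif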
+
+/*************************************************************************
+Creates an update node for a query graph. */
+
+upd_node_t*
+upd_node_create(
+/*============*/
+ /* out, own: update node */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(upd_node_t));
+ node->common.type = QUE_NODE_UPDATE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->select_will_do_update = FALSE;
+ node->in_mysql_interface = FALSE;
+
+ node->row = NULL;
+ node->index = NULL;
+
+ node->select = NULL;
+
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ node->cmpl_info = 0;
+
+ return(node);
+}
+
+/*************************************************************************
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /* in: record */
+ ulint pos, /* in: TRX_ID position in rec */
+ dulint trx_id, /* in: transaction id */
+ dulint roll_ptr)/* in: roll ptr of the undo log record */
+{
+ byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(field, trx_id);
+
+ field = rec_get_nth_field(rec, pos + 1, &len);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(field, roll_ptr);
+}
+
+/*************************************************************************
+Sets the trx id or roll ptr field of a clustered index entry. */
+
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ dtuple_t* entry, /* in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /* in: clustered index */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val) /* in: value to write */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint pos;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ dfield = dtuple_get_nth_field(entry, pos);
+ field = dfield_get_data(dfield);
+
+ if (type == DATA_TRX_ID) {
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/***************************************************************
+Returns TRUE if row update changes size of some field in index. */
+
+ibool
+row_upd_changes_field_size(
+/*=======================*/
+ /* out: TRUE if the update changes the size of
+ some field in index */
+ rec_t* rec, /* in: record in clustered index */
+ dict_index_t* index, /* in: clustered index */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+ new_len = new_val->len;
+
+ if (new_len == UNIV_SQL_NULL) {
+ new_len = dtype_get_sql_null_size(
+ dict_index_get_nth_type(index, i));
+ }
+
+ old_len = rec_get_nth_field_size(rec, upd_field->field_no);
+
+ if (old_len != new_len) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************
+Replaces the column values of the given record with the new values stored
+in the update vector. No field size changes are allowed. This function is
+used only for a clustered index. */
+
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /* in/out: record where replaced */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint n_fields;
+ ulint i;
+
+ rec_set_info_bits(rec, update->info_bits);
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ rec_set_nth_field(rec, upd_field->field_no,
+ dfield_get_data(new_val),
+ dfield_get_len(new_val));
+ }
+}
+
+/*************************************************************************
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record. */
+
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ /* out: new pointer to mlog */
+ dict_index_t* index, /* in: clustered index */
+ trx_t* trx, /* in: transaction */
+ dulint roll_ptr,/* in: roll ptr of the undo log record */
+ byte* log_ptr,/* pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(mtr);
+
+ log_ptr += mach_write_compressed(log_ptr,
+ dict_index_get_sys_col_pos(index, DATA_TRX_ID));
+
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+ return(log_ptr);
+}
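+
+/* The function above and row_upd_parse_sys_vals below form a writer/
+parser pair over a single byte layout: position (compressed), roll ptr
+(fixed width), trx id (compressed). As a toy illustration of such a
+symmetric layout, here is a round trip which uses fixed 4-byte fields
+everywhere instead of the compressed machine formats; the toy_ names are
+invented and this is not InnoDB code. */
+
+#if 0 /* illustration only: not compiled */
+static
+byte*
+toy_write_sys_vals(
+/*===============*/
+ /* out: new pointer into the buffer */
+ byte* p, /* in: buffer with at least 12 bytes free */
+ ulint pos, /* in: TRX_ID position in the record */
+ ulint roll_ptr, /* in: roll ptr (toy: 4 bytes only) */
+ ulint trx_id) /* in: trx id (toy: 4 bytes only) */
+{
+ mach_write_to_4(p, pos);
+ mach_write_to_4(p + 4, roll_ptr);
+ mach_write_to_4(p + 8, trx_id);
+
+ return(p + 12);
+}
+
+static
+byte*
+toy_parse_sys_vals(
+/*===============*/
+ /* out: new pointer, or NULL if incomplete */
+ byte* p, /* in: buffer */
+ byte* end_ptr, /* in: buffer end */
+ ulint* pos, /* out: TRX_ID position */
+ ulint* roll_ptr, /* out: roll ptr */
+ ulint* trx_id) /* out: trx id */
+{
+ if (end_ptr < p + 12) {
+
+ return(NULL);
+ }
+
+ *pos = mach_read_from_4(p);
+ *roll_ptr = mach_read_from_4(p + 4);
+ *trx_id = mach_read_from_4(p + 8);
+
+ return(p + 12);
+}
+#endif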
+
+/*************************************************************************
+Parses the log data of system field values. */
+
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ ulint* pos, /* out: TRX_ID position in record */
+ dulint* trx_id, /* out: trx id */
+ dulint* roll_ptr)/* out: roll ptr */
+{
+ ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+ return(NULL);
+ }
+
+ *roll_ptr = trx_read_roll_ptr(ptr);
+ ptr += DATA_ROLL_PTR_LEN;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id);
+
+ return(ptr);
+}
+
+/***************************************************************
+Writes to the redo log the new values of the fields occurring in the index. */
+
+void
+row_upd_index_write_log(
+/*====================*/
+ upd_t* update, /* in: update vector */
+ byte* log_ptr,/* in: pointer to mlog buffer: must contain at least
+ MLOG_BUF_MARGIN bytes of free space; the buffer is
+ closed within this function */
+ mtr_t* mtr) /* in: mtr into whose log to write */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf_end;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+ mach_write_to_1(log_ptr, update->info_bits);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+
+ ut_ad(MLOG_BUF_MARGIN > 30);
+
+ if (log_ptr + 30 > buf_end) {
+ mlog_close(mtr, log_ptr);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+
+ len = new_val->len;
+
+ log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+ log_ptr += mach_write_compressed(log_ptr, len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (log_ptr + len < buf_end) {
+ ut_memcpy(log_ptr, new_val->data, len);
+
+ log_ptr += len;
+ } else {
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, new_val->data, len);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+ }
+ }
+
+ mlog_close(mtr, log_ptr);
+}
+
+/*************************************************************************
+Parses the log data written by row_upd_index_write_log. */
+
+byte*
+row_upd_index_parse(
+/*================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ mem_heap_t* heap, /* in: memory heap where update vector is
+ built */
+ upd_t** update_out)/* out: update vector */
+{
+ upd_t* update;
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf;
+ ulint info_bits;
+ ulint i;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_bits = mach_read_from_1(ptr);
+ ptr++;
+ ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ update = upd_create(n_fields, heap);
+ update->info_bits = info_bits;
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ ptr = mach_parse_compressed(ptr, end_ptr,
+ &(upd_field->field_no));
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ new_val->len = len;
+
+ if (len != UNIV_SQL_NULL) {
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ } else {
+ buf = mem_heap_alloc(heap, len);
+ ut_memcpy(buf, ptr, len);
+
+ ptr += len;
+
+ new_val->data = buf;
+ }
+ }
+ }
+
+ *update_out = update;
+
+ return(ptr);
+}
+
+/*******************************************************************
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the same ordering fields. */
+
+upd_t*
+row_upd_build_difference(
+/*=====================*/
+ /* out, own: update vector of differing
+ fields, excluding roll ptr and trx id */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t* entry, /* in: entry to insert */
+ rec_t* rec, /* in: clustered index record */
+ mem_heap_t* heap) /* in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint roll_ptr_pos;
+ ulint trx_id_pos;
+ ulint i;
+
+ /* This function is used only for a clustered index */
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+ trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, i, &len);
+ dfield = dtuple_get_nth_field(entry, i);
+
+ if ((i != trx_id_pos) && (i != roll_ptr_pos)
+ && !dfield_data_is_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***************************************************************
+Copies the new column values stored in the update vector into the given
+index entry. */
+
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ dict_index_t* index, /* in: index; NOTE that may also be a
+ non-clustered index */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ dfield_t* new_val;
+ ulint field_no;
+ dict_index_t* clust_index;
+ ulint i;
+
+ ut_ad(index);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ field_no = dict_index_get_nth_col_pos(index,
+ dict_index_get_nth_col_no(clust_index,
+ upd_field->field_no));
+ if (field_no != ULINT_UNDEFINED) {
+ dfield = dtuple_get_nth_field(entry, field_no);
+
+ new_val = &(upd_field->new_val);
+
+ dfield_set_data(dfield, new_val->data, new_val->len);
+ }
+ }
+}
+
+/***************************************************************
+Copies the new column values stored in the update vector into the given
+clustered index entry. */
+
+void
+row_upd_clust_index_replace_new_col_vals(
+/*=====================================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ dfield_t* new_val;
+ ulint field_no;
+ ulint i;
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ field_no = upd_field->field_no;
+
+ dfield = dtuple_get_nth_field(entry, field_no);
+
+ new_val = &(upd_field->new_val);
+
+ dfield_set_data(dfield, new_val->data, new_val->len);
+ }
+}
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic. */
+
+ibool
+row_upd_changes_ord_field(
+/*======================*/
+ /* out: TRUE if update vector changes
+ an ordering field in the index record */
+ dtuple_t* row, /* in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /* in: index of the record */
+ upd_t* update) /* in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_field_t* ind_field;
+ dict_col_t* col;
+ ulint n_unique;
+ ulint n_upd_fields;
+ ulint col_pos;
+ ulint col_no;
+ ulint i, j;
+
+ ut_ad(update && index);
+
+ n_unique = dict_index_get_n_unique(index);
+ n_upd_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_unique; i++) {
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col);
+ col_no = dict_col_get_no(col);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && (row == NULL
+ || !dfield_datas_are_equal(
+ dtuple_get_nth_field(row, col_no),
+ &(upd_field->new_val)))) {
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
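+
+/* A toy rendering of the double loop above; it is illustration only, and
+the toy_ names are invented, not InnoDB code. Given the clustered field
+positions of the n_ord ordering columns and of the n_upd updated fields,
+it answers whether any ordering column is among the updated fields, in
+O(n_ord * n_upd) time, as noted in the comment above. */
+
+#if 0 /* illustration only: not compiled */
+static
+ibool
+toy_changes_ord_field(
+/*==================*/
+ /* out: TRUE if some ordering column is updated */
+ ulint* ord_pos, /* in: positions of the ordering columns */
+ ulint n_ord, /* in: number of ordering columns */
+ ulint* upd_pos, /* in: positions of the updated fields */
+ ulint n_upd) /* in: number of updated fields */
+{
+ ulint i;
+ ulint j;
+
+ for (i = 0; i < n_ord; i++) {
+ for (j = 0; j < n_upd; j++) {
+ if (ord_pos[i] == upd_pos[j]) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+#endif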
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic. */
+
+ibool
+row_upd_changes_some_index_ord_field(
+/*=================================*/
+ /* out: TRUE if update vector may change
+ an ordering field in an index record */
+ dict_table_t* table, /* in: table */
+ upd_t* update) /* in: update vector for the row */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index) {
+ if (row_upd_changes_ord_field(NULL, index, update)) {
+
+ return(TRUE);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /* in: record in a clustered index */
+ sym_node_t* column) /* in: first column in a column list, or
+ NULL */
+{
+ byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_field(rec,
+ column->field_nos[SYM_CLUST_FIELD_NO],
+ &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /* in: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/***************************************************************
+Stores to the heap the row on which the node->pcur is positioned. */
+UNIV_INLINE
+void
+row_upd_store_row(
+/*==============*/
+ upd_node_t* node) /* in: row update node */
+{
+ dict_index_t* clust_index;
+
+ ut_ad((node->pcur)->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ node->row = NULL;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ node->row = row_build(ROW_COPY_DATA, clust_index,
+ btr_pcur_get_rec(node->pcur), node->heap);
+}
+
+/***************************************************************
+Updates a secondary index entry of a row. */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool found;
+ dict_index_t* index;
+ dtuple_t* entry;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ mem_heap_t* heap;
+ rec_t* rec;
+ ulint err;
+
+ index = node->index;
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ ut_ad(found);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ /* Delete mark the old index record; it may already be delete marked if
+ we return after a lock wait in row_ins_index_entry below */
+
+ if (!rec_get_deleted_flag(rec)) {
+ err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, thr,
+ &mtr);
+ } else {
+ /* Already delete marked: initialize err here, as it is
+ tested below even when no delete mark is set */
+ err = DB_SUCCESS;
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete || (err != DB_SUCCESS)) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* Build a new index entry */
+ row_upd_index_replace_new_col_vals(entry, index, node->update);
+
+ /* Insert new index entry */
+ err = row_ins_index_entry(index, entry, thr);
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***************************************************************
+Updates a secondary index record if it is changed in the row update. This
+should be quite rare in database applications. */
+UNIV_INLINE
+ulint
+row_upd_sec_step(
+/*=============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!(node->index->type & DICT_CLUSTERED));
+
+ if ((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || row_upd_changes_ord_field(node->row, node->index,
+ node->update)) {
+ err = row_upd_sec_index_entry(node, thr);
+
+ return(err);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications. */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index of the record */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ mtr_commit(mtr);
+
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+
+ heap = mem_heap_create(1024);
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_upd_clust_index_replace_new_col_vals(entry, node->update);
+
+ row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+ err = row_ins_index_entry(index, entry, thr);
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***************************************************************
+Updates a clustered index record of a row when the ordering fields do
+not change. */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks, because we assume we have an x-lock
+ on the record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr_start(mtr);
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+ node->update, node->cmpl_info, thr, mtr);
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks a clustered index record. */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(node->is_delete);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ /* Store the row, because we also have to build the secondary index
+ entries */
+
+ row_upd_store_row(node);
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur,
+ TRUE, thr, mtr);
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Updates the clustered index record. */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, DB_LOCK_WAIT in case of a lock wait,
+ else error code */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ ulint err;
+ mtr_t mtr_buf;
+ mtr_t* mtr;
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+ mtr = &mtr_buf;
+
+ mtr_start(mtr);
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+
+ /* If this is a row in SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ if (!node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(0,
+ btr_pcur_get_rec(pcur),
+ index, thr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ /* NOTE: the following function calls will also commit mtr */
+
+ if (node->is_delete) {
+ err = row_upd_del_mark_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (!node->in_mysql_interface) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+
+ row_upd_copy_columns(btr_pcur_get_rec(pcur),
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ return(err);
+ }
+
+ row_upd_store_row(node);
+
+ if (row_upd_changes_ord_field(node->row, index, node->update)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+ TODO! What to do about the 'Halloween problem', where an
+ update moves the record forward in the index so that it is
+ again updated when the cursor arrives there? Solution: the
+ read operation must check the undo record undo number when
+ choosing records to update. Currently, MySQL solves the
+ problem externally! */
+
+ err = row_upd_clust_rec_by_insert(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ } else {
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_SOME_SEC;
+ }
+
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+}
+
+/***************************************************************
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and that its position has been stored in the cursor. */
+static
+ulint
+row_upd(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->in_mysql_interface) {
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (row_upd_changes_some_index_ord_field(node->table,
+ node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ if (node->state == UPD_NODE_UPDATE_CLUSTERED
+ || node->state == UPD_NODE_INSERT_CLUSTERED) {
+
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ goto function_exit;
+ }
+
+ while (node->index != NULL) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+function_exit:
+ if (err == DB_SUCCESS) {
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ node->row = NULL;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_upd_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ ulint err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* Either DB_LOCK_WAIT or an SQL error: in both cases
+ return control */
+
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(thr);
+}
+
+/*************************************************************************
+Performs an in-place update for the current clustered index record in
+select. */
+
+void
+row_upd_in_place_in_select(
+/*=======================*/
+ sel_node_t* sel_node, /* in: select node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ upd_node_t* node;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(sel_node->select_will_do_update);
+ ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(sel_node->asc);
+
+ node = que_node_get_parent(sel_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ /* Copy the necessary columns from clust_rec and calculate the new
+ values to set */
+
+ row_upd_copy_columns(btr_pcur_get_rec(pcur),
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE);
+ ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+ ut_ad(node->select_will_do_update);
+
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur,
+ node->update, node->cmpl_info,
+ thr, mtr);
+ ut_ad(err == DB_SUCCESS);
+}
diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c
new file mode 100644
index 00000000000..80acc7225df
--- /dev/null
+++ b/innobase/row/row0vers.c
@@ -0,0 +1,409 @@
+/******************************************************
+Row versions
+
+(c) 1997 Innobase Oy
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+
+/*********************************************************************
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function! */
+
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ /* out: NULL if committed, else the active
+ transaction; NOTE that the kernel mutex is
+ temporarily released! */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index) /* in: the secondary index */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ rec_t* version;
+ rec_t* prev_version;
+ dulint trx_id;
+ dulint prev_trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry;
+ trx_t* trx;
+ ibool vers_del;
+ ibool rec_del;
+ ulint err;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record: this is a time-consuming
+ operation: therefore we release the kernel mutex; also, the release
+ is required by the latching order convention. The latch on the
+ clustered index locks the top of the stack of versions. We also
+ reserve purge_latch to lock the bottom of the version stack. */
+
+ clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+ &clust_index, &mtr);
+ ut_a(clust_rec);
+
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index);
+
+ mtr_s_lock(&(purge_sys->latch), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active: no implicit lock on rec */
+
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+
+ /* We check if some earlier version of the clustered index record
+ would require rec to be in a different state (delete marked or
+ unmarked, or not existing). If there is such a version, then rec was
+ modified by the trx_id transaction, and it has an implicit x-lock on
+ rec. Note that if clust_rec itself would require rec to be in a
+ different state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+ rec_del = rec_get_deleted_flag(rec);
+ trx = NULL;
+
+ version = clust_rec;
+ heap = NULL;
+
+ for (;;) {
+ mutex_exit(&kernel_mutex);
+
+ /* While we retrieve an earlier version of clust_rec, we
+ release the kernel mutex, because it may take time to access
+ the disk. After the release, we have to check if the trx_id
+ transaction is still active. We keep the semaphore in mtr on
+ the clust_rec page, so that no other transaction can update
+ it and get an implicit x-lock on rec. */
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+ clust_index, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* version was stored in heap2,
+ if heap2 != NULL */
+ }
+
+ if (prev_version) {
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, heap);
+ entry = row_build_index_entry(row, index, heap);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no implicit x-lock */
+
+ break;
+ }
+
+ /* If the transaction is still active, the previous version
+ of clust_rec must be accessible if not a fresh insert; we
+ may assert the following: */
+
+ ut_ad(err == DB_SUCCESS);
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version: there is an
+ implicit x-lock on rec */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ /* If we get here, we know that the trx_id transaction is
+ still active and it has modified prev_version. Let us check
+ if prev_version would require rec to be in a different state. */
+
+ vers_del = rec_get_deleted_flag(prev_version);
+
+ if (0 == cmp_dtuple_rec(entry, rec)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index);
+
+ if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+ /* The versions modified by the trx_id transaction end
+ at prev_version: no implicit x-lock */
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(trx);
+}
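+
+/* The loop above uses a two-heap idiom which recurs in this file: the
+version built in the previous round lives in heap2, the next older
+version is built into a fresh heap, and heap2 is then freed, so at most
+two versions are allocated at a time. A stripped-down sketch of just the
+memory management, with the version building elided; toy_walk_versions is
+an invented name, not InnoDB code. */
+
+#if 0 /* illustration only: not compiled */
+static
+void
+toy_walk_versions(
+/*==============*/
+ ulint n_vers) /* in: number of versions to walk */
+{
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ ulint i;
+
+ heap = NULL;
+
+ for (i = 0; i < n_vers; i++) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ /* ... build the next older version into heap ... */
+
+ if (heap2) {
+ mem_heap_free(heap2); /* free the heap of the
+ previous round */
+ }
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
+#endif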
+
+/*********************************************************************
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view. */
+
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ /* out: TRUE if earlier version should be preserved */
+ dulint trx_id, /* in: transaction id in the version */
+ mtr_t* mtr) /* in: mtr holding the latch on the clustered index
+ record; it will also hold the latch on purge_view */
+{
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ if (trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* A purge operation is not yet allowed to remove this
+ delete marked record */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any non-delete-marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE. */
+
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+				/* out: TRUE if earlier version should have
+				the secondary index entry ientry */
+ ibool also_curr,/* in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ rec_t* rec, /* in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /* in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /* in: the secondary index */
+ dtuple_t* ientry) /* in: the secondary index entry */
+{
+ rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry;
+ ulint err;
+
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(rec),
+ MTR_MEMO_PAGE_S_FIX));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (also_curr && !rec_get_deleted_flag(rec)) {
+
+ heap = mem_heap_create(1024);
+ row = row_build(ROW_COPY_POINTERS, clust_index, rec, heap);
+ entry = row_build_index_entry(row, index, heap);
+
+ if (dtuple_datas_are_equal(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ version = rec;
+ heap = NULL;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* version was stored in heap2,
+ if heap2 != NULL */
+ }
+
+ if ((err != DB_SUCCESS) || !prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ if (!rec_get_deleted_flag(prev_version)) {
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, heap);
+ entry = row_build_index_entry(row, index, heap);
+
+ if (dtuple_datas_are_equal(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
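+
+/* A hedged sketch of a purge-side caller of the function above; the
+names clust_rec, sec_index, sec_entry and mtr are hypothetical, and the
+latches on clust_rec and purge_view are assumed to be held as the
+function comment requires: */
+#ifdef notdefined
+	if (!row_vers_old_has_index_entry(TRUE, clust_rec, &mtr,
+						sec_index, sec_entry)) {
+		/* No version >= the purge view has sec_entry as its
+		secondary index entry: the entry may be removed */
+	}
+#endif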
+
+/*********************************************************************
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version. */
+
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ /* out: DB_SUCCESS or DB_MISSING_HISTORY */
+ rec_t* rec, /* in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+			of this record */
+ mtr_t* mtr, /* in: mtr holding the latch on rec */
+ dict_index_t* index, /* in: the clustered index */
+ read_view_t* view, /* in: the consistent read view */
+ mem_heap_t* in_heap,/* in: memory heap from which the memory for
+ old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/* out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ rec_t* version;
+ rec_t* prev_version;
+ dulint prev_trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ byte* buf;
+ ulint err;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(rec),
+ MTR_MEMO_PAGE_S_FIX));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+ ut_ad(!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index)));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ version = rec;
+ heap = NULL;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ heap, &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* version was stored in heap2,
+ if heap2 != NULL */
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, index);
+
+ if (read_view_sees_trx_id(view, prev_trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap, rec_get_size(
+ prev_version));
+ *old_vers = rec_copy(buf, prev_version);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}
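+
+/* A hedged sketch of the calling pattern for a consistent read, using
+the signature declared above; rec, index, view, in_heap, old_vers and
+mtr are hypothetical caller-side names: */
+#ifdef notdefined
+	if (!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))) {
+
+		err = row_vers_build_for_consistent_read(rec, &mtr, index,
+						view, in_heap, &old_vers);
+		if ((err == DB_SUCCESS) && (old_vers == NULL)) {
+			/* The record was inserted after the read view was
+			opened: the row does not exist for this reader */
+		}
+	}
+#endif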
diff --git a/innobase/row/ts/makefile b/innobase/row/ts/makefile
new file mode 100644
index 00000000000..589db50d4ed
--- /dev/null
+++ b/innobase/row/ts/makefile
@@ -0,0 +1,16 @@
+
+
+
+include ..\..\makefile.i
+
+tstcur: ..\tcur.lib tstcur.c
+ $(CCOM) $(CFL) -I.. -I..\.. ..\tcur.lib ..\..\trx.lib ..\..\btr.lib ..\..\fut.lib ..\..\fsp.lib ..\..\page.lib ..\..\dyn.lib ..\..\mtr.lib ..\..\log.lib ..\..\rem.lib ..\..\fil.lib ..\..\buf.lib ..\..\dict.lib ..\..\data.lib ..\..\mach.lib ..\..\ha.lib ..\..\ut.lib ..\..\sync.lib ..\..\mem.lib ..\..\os.lib tstcur.c $(LFL)
+
+
+
+
+
+
+
+
+
diff --git a/innobase/row/ts/tstcur.c b/innobase/row/ts/tstcur.c
new file mode 100644
index 00000000000..f5a5eb1f9f3
--- /dev/null
+++ b/innobase/row/ts/tstcur.c
@@ -0,0 +1,1087 @@
+/************************************************************************
+Test for the index system
+
+(c) 1994-1996 Innobase Oy
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "sync0sync.h"
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "os0file.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "mtr0mtr.h"
+#include "log0log.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "dict0boot.h"
+#include "trx0sys.h"
+#include "dict0crea.h"
+#include "btr0btr.h"
+#include "btr0pcur.h"
+#include "rem0rec.h"
+#include "..\tcur0ins.h"
+
+os_file_t files[1000];
+
+mutex_t ios_mutex;
+ulint ios;
+ulint n[10];
+
+mutex_t incs_mutex;
+ulint incs;
+
+byte bigbuf[1000000];
+
+#define N_SPACES 1
+#define N_FILES 1
+#define FILE_SIZE 4000 /* must be > 512 */
+#define POOL_SIZE 1000
+#define COUNTER_OFFSET 1500
+
+#define LOOP_SIZE 150
+#define N_THREADS 5
+
+
+ulint zero = 0;
+
+buf_block_t* bl_arr[POOL_SIZE];
+
+/************************************************************************
+Io-handler thread function. */
+
+ulint
+handler_thread(
+/*===========*/
+ void* arg)
+{
+ ulint segment;
+ void* mess;
+ ulint i;
+ bool ret;
+
+ segment = *((ulint*)arg);
+
+ printf("Io handler thread %lu starts\n", segment);
+
+ for (i = 0;; i++) {
+ ret = fil_aio_wait(segment, &mess);
+ ut_a(ret);
+
+ buf_page_io_complete((buf_block_t*)mess);
+
+ mutex_enter(&ios_mutex);
+ ios++;
+ mutex_exit(&ios_mutex);
+
+ }
+
+ return(0);
+}
+
+/*************************************************************************
+Creates the files for the file system test and inserts them into
+the file system. */
+
+void
+create_files(void)
+/*==============*/
+{
+ bool ret;
+ ulint i, k;
+ char name[20];
+ os_thread_t thr[5];
+ os_thread_id_t id[5];
+
+ printf("--------------------------------------------------------\n");
+ printf("Create or open database files\n");
+
+ strcpy(name, "tsfile00");
+
+ for (k = 0; k < N_SPACES; k++) {
+ for (i = 0; i < N_FILES; i++) {
+
+ name[6] = (char)((ulint)'0' + k);
+ name[7] = (char)((ulint)'0' + i);
+
+ files[i] = os_file_create(name, OS_FILE_CREATE,
+ OS_FILE_TABLESPACE, &ret);
+
+ if (ret == FALSE) {
+ ut_a(os_file_get_last_error() ==
+ OS_FILE_ALREADY_EXISTS);
+
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN,
+ OS_FILE_TABLESPACE, &ret);
+
+ ut_a(ret);
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ if (i == 0) {
+ fil_space_create(name, k, OS_FILE_TABLESPACE);
+ }
+
+ ut_a(fil_validate());
+
+ fil_node_create(name, FILE_SIZE, k);
+ }
+ }
+
+ ios = 0;
+
+ mutex_create(&ios_mutex);
+
+ for (i = 0; i < 5; i++) {
+ n[i] = i;
+
+ thr[i] = os_thread_create(handler_thread, n + i, id + i);
+ }
+}
+
+/************************************************************************
+Inits space header of space 0. */
+
+void
+init_space(void)
+/*============*/
+{
+ mtr_t mtr;
+
+ printf("Init space header\n");
+
+ mtr_start(&mtr);
+
+ fsp_header_init(0, FILE_SIZE * N_FILES, &mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Test for index page. */
+
+void
+test1(void)
+/*=======*/
+{
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ ulint rnd = 0;
+ dict_index_t* index;
+ dict_table_t* table;
+ byte buf[16];
+ ulint i, j;
+ ulint tm, oldtm;
+ trx_t* trx;
+/* dict_tree_t* tree;*/
+ btr_pcur_t pcur;
+ btr_pcur_t pcur2;
+ mtr_t mtr;
+ mtr_t mtr2;
+ byte* field;
+ ulint len;
+ dtuple_t* search_tuple;
+ dict_tree_t* index_tree;
+ rec_t* rec;
+
+ UT_NOT_USED(len);
+ UT_NOT_USED(field);
+ UT_NOT_USED(pcur2);
+/*
+ printf("\n\n\nPress 2 x enter to start test\n");
+
+ while (EOF == getchar()) {
+
+ }
+
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 1. CREATE TABLE WITH 3 COLUMNS AND WITH 3 INDEXES\n");
+
+ heap = mem_heap_create(1024);
+ heap2 = mem_heap_create(1024);
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ table = dict_mem_table_create("TS_TABLE1", 0, 3);
+
+ dict_mem_table_add_col(table, "COL1", DATA_VARCHAR,
+ DATA_ENGLISH, 10, 0);
+ dict_mem_table_add_col(table, "COL2", DATA_VARCHAR,
+ DATA_ENGLISH, 10, 0);
+ dict_mem_table_add_col(table, "COL3", DATA_VARCHAR,
+ DATA_ENGLISH, 100, 0);
+
+ ut_a(TRUE == dict_create_table(table, trx));
+
+ index = dict_mem_index_create("TS_TABLE1", "IND1", 75046,
+ DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "COL1", 0);
+ dict_mem_index_add_field(index, "COL2", 0);
+
+ ut_a(mem_heap_validate(index->heap));
+
+ ut_a(TRUE == dict_create_index(index, trx));
+
+ trx_commit(trx);
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ index = dict_mem_index_create("TS_TABLE1", "IND2", 0, DICT_UNIQUE, 1);
+
+ dict_mem_index_add_field(index, "COL2", 0);
+
+ ut_a(mem_heap_validate(index->heap));
+
+ ut_a(TRUE == dict_create_index(index, trx));
+
+ trx_commit(trx);
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ index = dict_mem_index_create("TS_TABLE1", "IND3", 0, DICT_UNIQUE, 1);
+
+ dict_mem_index_add_field(index, "COL2", 0);
+
+ ut_a(mem_heap_validate(index->heap));
+
+ ut_a(TRUE == dict_create_index(index, trx));
+
+ trx_commit(trx);
+/*
+ tree = dict_index_get_tree(dict_table_get_first_index(table));
+
+ btr_print_tree(tree, 10);
+*/
+ dict_table_print(table);
+
+ /*---------------------------------------------------------*/
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 2. INSERT 1 ROW TO THE TABLE\n");
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ tuple = dtuple_create(heap, 3);
+
+ table = dict_table_get("TS_TABLE1", trx);
+
+ dtuple_gen_test_tuple3(tuple, 0, buf);
+ tcur_insert(tuple, table, heap2, trx);
+
+ trx_commit(trx);
+/*
+ tree = dict_index_get_tree(dict_table_get_first_index(table));
+
+ btr_print_tree(tree, 10);
+*/
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 3. INSERT MANY ROWS TO THE TABLE IN A SINGLE TRX\n");
+
+ rnd = 0;
+ oldtm = ut_clock();
+
+ trx = trx_start(ULINT_UNDEFINED);
+ for (i = 0; i < 300 * UNIV_DBC * UNIV_DBC; i++) {
+
+ if (i % 5000 == 0) {
+ /* dict_table_print(table);
+ buf_print();
+ buf_LRU_print();
+ printf("%lu rows inserted\n", i); */
+ }
+
+ table = dict_table_get("TS_TABLE1", trx);
+
+ if (i == 2180) {
+ rnd = rnd % 200000;
+ }
+
+ rnd = (rnd + 1) % 200000;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ tcur_insert(tuple, table, heap2, trx);
+
+ mem_heap_empty(heap2);
+
+ if (i % 4 == 3) {
+ }
+ }
+ trx_commit(trx);
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("%lu rows inserted\n", i);
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 4. PRINT PART OF CONTENTS OF EACH INDEX TREE\n");
+
+/*
+ mem_print_info();
+*/
+
+/*
+ tree = dict_index_get_tree(dict_table_get_first_index(table));
+
+ btr_print_tree(tree, 10);
+
+ tree = dict_index_get_tree(dict_table_get_next_index(
+ dict_table_get_first_index(table)));
+
+ btr_print_tree(tree, 5);
+*/
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+/* mem_print_info(); */
+
+ os_thread_sleep(5000000);
+
+ for (j = 0; j < 5; j++) {
+ printf("-------------------------------------------------\n");
+ printf("TEST 5. CALCULATE THE JOIN OF THE TABLE WITH ITSELF\n");
+
+ i = 0;
+
+ oldtm = ut_clock();
+
+ mtr_start(&mtr);
+
+ index_tree = dict_index_get_tree(UT_LIST_GET_FIRST(table->indexes));
+
+ search_tuple = dtuple_create(heap, 2);
+
+ dtuple_gen_search_tuple3(search_tuple, i, buf);
+
+ btr_pcur_open(index_tree, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ ut_a(btr_pcur_move_to_next(&pcur, &mtr));
+
+ while (!btr_pcur_is_after_last_in_tree(&pcur, &mtr)) {
+
+ if (i % 20000 == 0) {
+ printf("%lu rows joined\n", i);
+ }
+
+ index_tree = dict_index_get_tree(
+ UT_LIST_GET_FIRST(table->indexes));
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ rec_copy_prefix_to_dtuple(search_tuple, rec, 2, heap2);
+
+ mtr_start(&mtr2);
+
+ btr_pcur_open(index_tree, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur2, &mtr2);
+
+ btr_pcur_move_to_next(&pcur2, &mtr2);
+
+ rec = btr_pcur_get_rec(&pcur2);
+
+ field = rec_get_nth_field(rec, 1, &len);
+
+ ut_a(len == 8);
+
+ ut_a(ut_memcmp(field, dfield_get_data(
+ dtuple_get_nth_field(search_tuple, 1)),
+ len) == 0);
+
+			btr_pcur_close(&pcur2, &mtr2);
+
+ mem_heap_empty(heap2);
+
+ mtr_commit(&mtr2);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ btr_pcur_move_to_next(&pcur, &mtr);
+ i++;
+ }
+
+ btr_pcur_close(&pcur, &mtr);
+ mtr_commit(&mtr);
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("%lu rows joined\n", i);
+ }
+
+ oldtm = ut_clock();
+
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 6. INSERT MANY ROWS TO THE TABLE IN SEPARATE TRXS\n");
+
+ rnd = 200000;
+
+ for (i = 0; i < 350; i++) {
+
+ if (i % 4 == 0) {
+ }
+ trx = trx_start(ULINT_UNDEFINED);
+
+ table = dict_table_get("TS_TABLE1", trx);
+
+ if (i == 2180) {
+ rnd = rnd % 200000;
+ }
+
+ rnd = (rnd + 1) % 200000;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ tcur_insert(tuple, table, heap2, trx);
+
+ trx_commit(trx);
+
+ mem_heap_empty(heap2);
+ if (i % 4 == 3) {
+ }
+ }
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("%lu rows inserted in %lu transactions\n", i, i);
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 7. PRINT MEMORY ALLOCATION INFO\n");
+
+ mem_print_info();
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 8. PRINT SEMAPHORE INFO\n");
+
+ sync_print();
+
+
+
+#ifdef notdefined
+ rnd = 90000;
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ if (i == 50000) {
+ rnd = rnd % 200000;
+ }
+
+ rnd = (rnd + 595659561) % 200000;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ btr_pcur_open(tree, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &cursor, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+
+ rnd = 0;
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ rnd = (rnd + 35608971) % 200000 + 1;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+
+/* btr_print_tree(tree, 3); */
+
+#endif
+ mem_heap_free(heap);
+}
+
+
+#ifdef notdefined
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 534671) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+/* page_print_list(page, 151); */
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 512);
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 7771) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_get_n_recs(page) == 0);
+
+ ut_a(page_validate(page, index));
+ page = page_create(frame, &mtr);
+
+ rnd = 311;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 512);
+
+ rnd = 217;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 0);
+ page = page_create(frame, &mtr);
+
+ rnd = 291;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd - 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 512);
+
+ rnd = 277;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd - 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 0);
+
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+}
+
+/*********************************************************************
+Test for index page. */
+
+void
+test2(void)
+/*=======*/
+{
+ page_t* page;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ ulint i, j;
+ ulint rnd = 0;
+ rec_t* rec;
+ page_cur_t cursor;
+ dict_index_t* index;
+ dict_table_t* table;
+ buf_block_t* block;
+ buf_frame_t* frame;
+ ulint tm, oldtm;
+ byte buf[8];
+ mtr_t mtr;
+
+ printf("-------------------------------------------------\n");
+ printf("TEST 2. Speed test\n");
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+ ut_memcpy(bigbuf, bigbuf + 800, 800);
+ }
+
+ tm = ut_clock();
+	printf("Wall time for %lu mem copies of 800 bytes %lu millisecs\n",
+ i, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ rnd = 0;
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+ ut_memcpy(bigbuf + rnd, bigbuf + rnd + 800, 800);
+ rnd += 1600;
+ if (rnd > 995000) {
+ rnd = 0;
+ }
+ }
+
+ tm = ut_clock();
+	printf("Wall time for %lu mem copies of 800 bytes %lu millisecs\n",
+ i, tm - oldtm);
+
+ heap = mem_heap_create(0);
+
+ table = dict_table_create("TS_TABLE2", 2);
+
+ dict_table_add_col(table, "COL1", DATA_VARCHAR, DATA_ENGLISH, 10, 0);
+ dict_table_add_col(table, "COL2", DATA_VARCHAR, DATA_ENGLISH, 10, 0);
+
+ ut_a(0 == dict_table_publish(table));
+
+ index = dict_index_create("TS_TABLE2", "IND2", 0, 2, 0);
+
+ dict_index_add_field(index, "COL1", 0);
+ dict_index_add_field(index, "COL2", 0);
+
+ ut_a(0 == dict_index_publish(index));
+
+ index = dict_index_get("TS_TABLE2", "IND2");
+ ut_a(index);
+
+ tuple = dtuple_create(heap, 2);
+
+ oldtm = ut_clock();
+
+ rnd = 677;
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for insertion of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(0, 5, &mtr);
+ buf_page_s_lock(block, &mtr);
+
+ page = buf_block_get_frame(block);
+ ut_a(page_validate(page, index));
+ mtr_commit(&mtr);
+
+ oldtm = ut_clock();
+
+ rnd = 677;
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf(
+ "Wall time for %lu empty loops with page create %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 100;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 1) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf(
+ "Wall time for sequential insertion of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 500;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd - 1) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf(
+ "Wall time for descend. seq. insertion of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 677;
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+
+ rnd = 677;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+ }
+ ut_a(page_get_n_recs(page) == 0);
+
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for insert and delete of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 677;
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ ut_a(page_validate(page, index));
+ mtr_print(&mtr);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ rnd = 677;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+ }
+ }
+
+ tm = ut_clock();
+ printf("Wall time for search of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ rnd = 677;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ }
+ }
+
+ tm = ut_clock();
+ printf("Wall time for %lu empty loops %lu milliseconds\n",
+ i * j, tm - oldtm);
+ mtr_commit(&mtr);
+}
+
+#endif
+
+/********************************************************************
+Main test function. */
+
+void
+main(void)
+/*======*/
+{
+ ulint tm, oldtm;
+ mtr_t mtr;
+
+ sync_init();
+ mem_init();
+ os_aio_init(160, 5);
+ fil_init(25);
+ buf_pool_init(POOL_SIZE, POOL_SIZE);
+ fsp_init();
+ log_init();
+
+ create_files();
+ init_space();
+
+ mtr_start(&mtr);
+
+ trx_sys_create(&mtr);
+ dict_create(&mtr);
+
+ mtr_commit(&mtr);
+
+
+ oldtm = ut_clock();
+
+ ut_rnd_set_seed(19);
+
+ test1();
+
+/* mem_print_info(); */
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("TESTS COMPLETED SUCCESSFULLY!\n");
+}