author      unknown <monty@donna.mysql.com>    2001-02-17 14:19:19 +0200
committer   unknown <monty@donna.mysql.com>    2001-02-17 14:19:19 +0200
commit      2662b59306ef0cd495fa6e2edf7129e58a11393a (patch)
tree        bfe39951a73e906579ab819bf5198ad8f3a64a36 /innobase/row
parent      66de55a56bdcf2f7a9c0c4f8e19b3e761475e202 (diff)
download    mariadb-git-2662b59306ef0cd495fa6e2edf7129e58a11393a.tar.gz
Added Innobase to source distribution
Docs/manual.texi:
Added Innobase documentation
configure.in:
Incremented version
include/my_base.h:
Added option for Innobase
myisam/mi_check.c:
cleanup
mysql-test/t/bdb.test:
cleanup
mysql-test/t/innobase.test:
Extended with new tests from bdb.test
mysql-test/t/merge.test:
Added test of SHOW CREATE
mysys/my_init.c:
Fix for UNIXWARE 7
scripts/mysql_install_db.sh:
Always write how to start mysqld
scripts/safe_mysqld.sh:
Fixed typo
sql/ha_innobase.cc:
Update to new version
sql/ha_innobase.h:
Update to new version
sql/handler.h:
Added 'update_table_comment()' and 'append_create_info()' (see the sketch after this file list)
sql/sql_delete.cc:
Fixes for Innobase
sql/sql_select.cc:
Fixes for Innobase
sql/sql_show.cc:
Append create information (for MERGE tables)
sql/sql_update.cc:
Fixes for Innobase
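
The two handler hooks named above for sql/handler.h are what sql_show.cc uses to let a storage engine contribute text to table comments and to SHOW CREATE output (the MERGE engine uses the latter to print its union list, per the sql_show.cc note above). Because this listing is limited to 'innobase/row', those declarations are not shown here; the following is a minimal sketch of what such default no-op hooks could look like, with assumed signatures and a deliberately hypothetical class name:

/* Sketch only: signatures are assumed, since sql/handler.h is not
   part of this 'innobase/row'-limited listing. */
class String;                /* MySQL's string class, declared elsewhere */

class handler_sketch {
public:
        /* Engine may replace or extend the comment shown for a table;
           the default echoes the caller's comment unchanged. */
        virtual char* update_table_comment(const char* comment)
        {
                return (char*) comment;
        }

        /* Engine may append engine-specific clauses (e.g. the UNION
           list of a MERGE table) to the CREATE TABLE text being built. */
        virtual void append_create_info(String* packet) {}

        virtual ~handler_sketch() {}
};

An engine that has nothing to add simply inherits both no-ops; sql_show.cc can then call the hooks unconditionally while building its output.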
Diffstat (limited to 'innobase/row')
-rw-r--r--  innobase/row/Makefile.am      25
-rw-r--r--  innobase/row/makefilewin      34
-rw-r--r--  innobase/row/row0ins.c      1018
-rw-r--r--  innobase/row/row0mysql.c    1116
-rw-r--r--  innobase/row/row0purge.c     553
-rw-r--r--  innobase/row/row0row.c       652
-rw-r--r--  innobase/row/row0sel.c      2732
-rw-r--r--  innobase/row/row0uins.c      308
-rw-r--r--  innobase/row/row0umod.c      608
-rw-r--r--  innobase/row/row0undo.c      313
-rw-r--r--  innobase/row/row0upd.c      1394
-rw-r--r--  innobase/row/row0vers.c      409
-rw-r--r--  innobase/row/ts/makefile      16
-rw-r--r--  innobase/row/ts/tstcur.c    1087
14 files changed, 10265 insertions, 0 deletions
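
One self-contained detail from row0mysql.c in the diff below: row_mysql_store_blob_ref() writes a MySQL-format BLOB reference as a little-endian length occupying the first col_len - 8 bytes, followed by 8 bytes holding the raw data pointer (of which only the first 4 are used on 32-bit machines, per the comment in the code). A standalone restatement of that layout, using an illustrative helper name in place of InnoDB's mach_* routines:

#include <cstdint>
#include <cstring>

/* Illustrative sketch of the BLOB reference layout produced by
   row_mysql_store_blob_ref() in the diff below:
   [length, little-endian, col_len - 8 bytes][data pointer, 8 bytes]. */
static void store_blob_ref_sketch(unsigned char* dest, size_t col_len,
                                  const unsigned char* data, uint32_t len)
{
        size_t len_bytes = col_len - 8;  /* 1..4 bytes for the length */

        for (size_t i = 0; i < len_bytes; i++) {
                /* little-endian length, lowest byte first */
                dest[i] = (unsigned char) (len >> (8 * i));
        }

        /* Store the pointer value itself; a 32-bit build fills only
           the first 4 of the 8 reserved bytes. */
        memcpy(dest + len_bytes, &data, sizeof(data));
}

Reading a reference back, as row_mysql_read_blob_ref() in the diff does, simply reverses the two steps: decode the length from the first col_len - 8 bytes, then copy the pointer out of the tail.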
diff --git a/innobase/row/Makefile.am b/innobase/row/Makefile.am new file mode 100644 index 00000000000..e4fcbe8f715 --- /dev/null +++ b/innobase/row/Makefile.am @@ -0,0 +1,25 @@ +# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB +# & Innobase Oy +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +include ../include/Makefile.i + +libs_LIBRARIES = librow.a + +librow_a_SOURCES = row0ins.c row0mysql.c row0purge.c row0row.c row0sel.c\ + row0uins.c row0umod.c row0undo.c row0upd.c row0vers.c + +EXTRA_PROGRAMS = diff --git a/innobase/row/makefilewin b/innobase/row/makefilewin new file mode 100644 index 00000000000..c17240c6119 --- /dev/null +++ b/innobase/row/makefilewin @@ -0,0 +1,34 @@ +include ..\include\makefile.i + +row.lib: row0mysql.obj row0upd.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj + lib -out:..\libs\row.lib row0mysql.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj + +row0mysql.obj: row0mysql.c + $(CCOM) $(CFL) -c row0mysql.c + +row0ins.obj: row0ins.c + $(CCOM) $(CFL) -c row0ins.c + +row0sel.obj: row0sel.c + $(CCOM) $(CFL) -c row0sel.c + +row0upd.obj: row0upd.c + $(CCOM) $(CFL) -c row0upd.c + +row0undo.obj: row0undo.c + $(CCOM) $(CFL) -c row0undo.c + +row0purge.obj: row0purge.c + $(CCOM) $(CFL) -c row0purge.c + +row0row.obj: row0row.c + $(CCOM) $(CFL) -c row0row.c + +row0vers.obj: row0vers.c + $(CCOM) $(CFL) -c row0vers.c + +row0umod.obj: row0umod.c + $(CCOM) $(CFL) -c row0umod.c + +row0uins.obj: row0uins.c + $(CCOM) $(CFL) -c row0uins.c diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c new file mode 100644 index 00000000000..4502cb8235f --- /dev/null +++ b/innobase/row/row0ins.c @@ -0,0 +1,1018 @@ +/****************************************************** +Insert into a table + +(c) 1996 Innobase Oy + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0ins.h" + +#ifdef UNIV_NONINL +#include "row0ins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0row.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "eval0eval.h" +#include "data0data.h" +#include "usr0sess.h" + +#define ROW_INS_PREV 1 +#define ROW_INS_NEXT 2 + +/************************************************************************* +Creates an insert node struct. */ + +ins_node_t* +ins_node_create( +/*============*/ + /* out, own: insert node struct */ + ulint ins_type, /* in: INS_VALUES, ... 
*/ + dict_table_t* table, /* in: table where to insert */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + ins_node_t* node; + + node = mem_heap_alloc(heap, sizeof(ins_node_t)); + + node->common.type = QUE_NODE_INSERT; + + node->ins_type = ins_type; + + node->state = INS_NODE_SET_IX_LOCK; + node->table = table; + node->index = NULL; + node->entry = NULL; + + node->select = NULL; + + node->trx_id = ut_dulint_zero; + + node->entry_sys_heap = mem_heap_create(128); + + node->magic_n = INS_NODE_MAGIC_N; + + return(node); +} + +/*************************************************************** +Creates an entry template for each index of a table. */ +static +void +ins_node_create_entry_list( +/*=======================*/ + ins_node_t* node) /* in: row insert node */ +{ + dict_index_t* index; + dtuple_t* entry; + + ut_ad(node->entry_sys_heap); + + UT_LIST_INIT(node->entry_list); + + index = dict_table_get_first_index(node->table); + + while (index != NULL) { + entry = row_build_index_entry(node->row, index, + node->entry_sys_heap); + UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry); + + index = dict_table_get_next_index(index); + } +} + +/********************************************************************* +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /* in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + mem_heap_t* heap; + dict_col_t* col; + dfield_t* dfield; + ulint len; + byte* ptr; + + row = node->row; + table = node->table; + heap = node->entry_sys_heap; + + ut_ad(row && table && heap); + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* 1. Allocate buffer for row id */ + + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN); + + node->row_id_buf = ptr; + + if (table->type == DICT_TABLE_CLUSTER_MEMBER) { + + /* 2. Fill in the dfield for mix id */ + + col = dict_table_get_sys_col(table, DATA_MIX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + len = mach_dulint_get_compressed_size(table->mix_id); + ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN); + + mach_dulint_write_compressed(ptr, table->mix_id); + dfield_set_data(dfield, ptr, len); + } + + /* 3. Allocate buffer for trx id */ + + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN); + + node->trx_id_buf = ptr; + + /* 4. Allocate buffer for roll ptr */ + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + + dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN); +} + +/************************************************************************* +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. 
*/ + +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /* in: insert node */ + dtuple_t* row) /* in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry = NULL; + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = ut_dulint_zero; +} + +/*********************************************************************** +Does an insert operation by updating a delete marked existing record +in the index. This situation can occur if the delete marked record is +kept in the index for consistent reads. */ +static +ulint +row_ins_sec_index_entry_by_modify( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + btr_cur_t* cursor, /* in: B-tree cursor */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + ut_ad(((cursor->index)->type & DICT_CLUSTERED) == 0); + ut_ad(rec_get_deleted_flag(btr_cur_get_rec(cursor))); + + /* We just remove the delete mark from the secondary index record */ + err = btr_cur_del_mark_set_sec_rec(0, cursor, FALSE, thr, mtr); + + return(err); +} + +/*********************************************************************** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. */ +static +ulint +row_ins_clust_index_entry_by_modify( +/*================================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /* in: B-tree cursor */ + dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + mem_heap_t* heap; + rec_t* rec; + upd_t* update; + ulint err; + + ut_ad((cursor->index)->type & DICT_CLUSTERED); + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec)); + + heap = mem_heap_create(1024); + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may contain also system columns! */ + + update = row_upd_build_difference(cursor->index, entry, rec, heap); + + if (mode == BTR_MODIFY_LEAF) { + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(0, cursor, update, 0, thr, + mtr); + if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update(0, cursor, update, 0, thr, + mtr); + } + + mem_heap_free(heap); + + return(err); +} + +/******************************************************************* +Checks if a unique key violation to rec would occur at the index entry +insert. 
*/ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + /* out: TRUE if error */ + rec_t* rec, /* in: user record */ + dtuple_t* entry, /* in: entry to insert */ + dict_index_t* index, /* in: index */ + trx_t* trx) /* in: inserting transaction */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + trx_t* impl_trx; + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + if (!rec_get_deleted_flag(rec)) { + + return(TRUE); + } + + /* If we get here, the record has its delete mark set. It is still + a unique key violation if the transaction which set the delete mark + is currently active and is not trx itself. We check if some + transaction has an implicit x-lock on the record. */ + + mutex_enter(&kernel_mutex); + + if (index->type & DICT_CLUSTERED) { + impl_trx = lock_clust_rec_some_has_impl(rec, index); + } else { + impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index); + } + + mutex_exit(&kernel_mutex); + + if (impl_trx && impl_trx != trx) { + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************* +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. */ +static +ulint +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + /* out: DB_SUCCESS or DB_DUPLICATE_KEY */ + dict_index_t* index, /* in: non-clustered unique index */ + dtuple_t* entry, /* in: index entry */ + trx_t* trx) /* in: inserting transaction */ +{ + ulint dupl_count = 0; + int cmp; + ulint n_fields_cmp; + rec_t* rec; + btr_pcur_t pcur; + mtr_t mtr; + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index)); + + btr_pcur_open_on_user_rec(index, entry, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + /* Scan index records and check that there are no duplicates */ + + for (;;) { + if (btr_pcur_is_after_last_in_tree(&pcur, &mtr)) { + + break; + } + + rec = btr_pcur_get_rec(&pcur); + + cmp = cmp_dtuple_rec(entry, rec); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, index, + trx)) { + dupl_count++; + + if (dupl_count > 1) { + /* printf( + "Duplicate key in index %s\n", + index->name); + dtuple_print(entry); */ + } + } + } + + if (cmp < 0) { + break; + } + + ut_a(cmp == 0); + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + ut_a(dupl_count >= 1); + + if (dupl_count > 1) { + + return(DB_DUPLICATE_KEY); + } + + return(DB_SUCCESS); +} + +/******************************************************************* +Tries to check if a unique key violation error would occur at an index entry +insert. */ +static +ulint +row_ins_duplicate_error( +/*====================*/ + /* out: DB_SUCCESS if no error + DB_DUPLICATE_KEY if error, + DB_STRONG_FAIL if this is a non-clustered + index record and we cannot determine yet + if there will be an error: in this last + case we must call + row_ins_scan_sec_index_for_duplicate + AFTER the insertion of the record! 
*/ + btr_cur_t* cursor, /* in: B-tree cursor */ + dtuple_t* entry, /* in: entry to insert */ + trx_t* trx, /* in: inserting transaction */ + mtr_t* mtr, /* in: mtr */ + rec_t** dupl_rec)/* out: record with which duplicate error */ +{ + rec_t* rec; + page_t* page; + ulint n_unique; + + ut_ad(cursor->index->type & DICT_UNIQUE); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row refererence part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that in the B-tree node pointers on an + upper level may match more to the entry than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case. */ + + n_unique = dict_index_get_n_unique(cursor->index); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + page = buf_frame_align(rec); + + if (rec != page_get_infimum_rec(page)) { + + if (row_ins_dupl_error_with_rec(rec, entry, + cursor->index, trx)) { + *dupl_rec = rec; + + return(DB_DUPLICATE_KEY); + } + } + } + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + page = buf_frame_align(rec); + + if (rec != page_get_supremum_rec(page)) { + + if (row_ins_dupl_error_with_rec(rec, entry, + cursor->index, trx)) { + *dupl_rec = rec; + + return(DB_DUPLICATE_KEY); + } + } + + ut_a(!(cursor->index->type & DICT_CLUSTERED)); + /* This should never happen */ + } + + if (cursor->index->type & DICT_CLUSTERED) { + + return(DB_SUCCESS); + } + + /* It was a non-clustered index: we must scan the index after the + insertion to be sure if there will be duplicate key error */ + + return(DB_STRONG_FAIL); +} + +/******************************************************************* +Checks if an index entry has long enough common prefix with an existing +record so that the intended insert of the entry must be changed to a modify of +the existing record. In the case of a clustered index, the prefix must be +n_unique fields long, and in the case of a secondary index, all fields must be +equal. */ +UNIV_INLINE +ulint +row_ins_must_modify( +/*================*/ + /* out: 0 if no update, ROW_INS_PREV if + previous should be updated; currently we + do the search so that only the low_match + record can match enough to the search tuple, + not the next record */ + btr_cur_t* cursor) /* in: B-tree cursor */ +{ + ulint enough_match; + rec_t* rec; + page_t* page; + + /* NOTE: (compare to the note in row_ins_duplicate_error) Because node + pointers on upper levels of the B-tree may match more to entry than + to actual user records on the leaf level, we have to check if the + candidate record is actually a user record. In a clustered index + node pointers contain index->n_unique first fields, and in the case + of a secondary index, all fields of the index. 
*/ + + enough_match = dict_index_get_n_unique_in_tree(cursor->index); + + if (cursor->low_match >= enough_match) { + + rec = btr_cur_get_rec(cursor); + page = buf_frame_align(rec); + + if (rec != page_get_infimum_rec(page)) { + + return(ROW_INS_PREV); + } + } + + return(0); +} + +/******************************************************************* +Tries to insert an index entry to an index. If the index is clustered +and a record with the same unique key is found, the other record is +necessarily marked deleted by a committed transaction, or a unique key +violation error occurs. The delete marked record is then updated to an +existing record, and we must write an undo log record on the delete +marked record. If the index is secondary, and a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. */ + +ulint +row_ins_index_entry_low( +/*====================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL + if pessimistic retry needed, or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr) /* in: query thread */ +{ + btr_cur_t cursor; + ulint dupl = DB_SUCCESS; + ulint modify; + rec_t* dummy_rec; + rec_t* rec; + rec_t* dupl_rec; /* Note that this may be undefined + for a non-clustered index even if + there is a duplicate key */ + ulint err; + ulint n_unique; + mtr_t mtr; + + log_free_check(); + mtr_start(&mtr); + + cursor.thr = thr; + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + mode | BTR_INSERT, &cursor, 0, &mtr); + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + /* The insertion was made to the insert buffer already during + the search: we are done */ + + err = DB_SUCCESS; + + goto function_exit; + } + + n_unique = dict_index_get_n_unique(index); + + if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique + || cursor.low_match >= n_unique)) { + + dupl = row_ins_duplicate_error(&cursor, entry, + thr_get_trx(thr), &mtr, &dupl_rec); + if (dupl == DB_DUPLICATE_KEY) { + + /* printf("Duplicate key in index %s lm %lu\n", + cursor->index->name, cursor->low_match); + rec_print(rec); + dtuple_print(entry); */ + + err = dupl; + + goto function_exit; + } + } + + modify = row_ins_must_modify(&cursor); + + if (modify != 0) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + + if (modify == ROW_INS_NEXT) { + rec = page_rec_get_next(btr_cur_get_rec(&cursor)); + + btr_cur_position(index, rec, &cursor); + } + + if (index->type & DICT_CLUSTERED) { + err = row_ins_clust_index_entry_by_modify(mode, + &cursor, entry, + thr, &mtr); + } else { + err = row_ins_sec_index_entry_by_modify(&cursor, + thr, &mtr); + } + + } else if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert(0, &cursor, entry, + &dummy_rec, thr, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_insert(0, &cursor, entry, + &dummy_rec, thr, &mtr); + } +function_exit: + mtr_commit(&mtr); + + if (err == DB_SUCCESS && dupl == DB_STRONG_FAIL) { + /* We were not able to determine before the insertion + whether there 
will be a duplicate key error: do the check + now */ + + err = row_ins_scan_sec_index_for_duplicate(index, entry, + thr_get_trx(thr)); + } + + ut_ad(err != DB_DUPLICATE_KEY || index->type & DICT_CLUSTERED + || DB_DUPLICATE_KEY == + row_ins_scan_sec_index_for_duplicate(index, entry, + thr_get_trx(thr))); + return(err); +} + +/******************************************************************* +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. */ + +ulint +row_ins_index_entry( +/*================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DUPLICATE_KEY, or some other error code */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + /* Try first optimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr); + + if (err != DB_FAIL) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr); + + return(err); +} + +/*************************************************************** +Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. */ +UNIV_INLINE +void +row_ins_index_entry_set_vals( +/*=========================*/ + dtuple_t* entry, /* in: index entry to make */ + dtuple_t* row) /* in: row */ +{ + dfield_t* field; + dfield_t* row_field; + ulint n_fields; + ulint i; + + ut_ad(entry && row); + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(entry, i); + + row_field = dtuple_get_nth_field(row, field->col_no); + + field->data = row_field->data; + field->len = row_field->len; + } +} + +/*************************************************************** +Inserts a single index entry to the table. */ +UNIV_INLINE +ulint +row_ins_index_entry_step( +/*=====================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + ins_node_t* node, /* in: row insert node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(dtuple_check_typed(node->row)); + + row_ins_index_entry_set_vals(node->entry, node->row); + + ut_ad(dtuple_check_typed(node->entry)); + + err = row_ins_index_entry(node->index, node->entry, thr); + + return(err); +} + +/*************************************************************** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /* in: row insert node */ +{ + dulint row_id; + + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + + if (dict_table_get_first_index(node->table)->type & DICT_UNIQUE) { + + /* No row id is stored if the clustered index is unique */ + + return; + } + + /* Fill in row id value to row */ + + row_id = dict_sys_get_new_row_id(); + + dict_sys_write_row_id(node->row_id_buf, row_id); +} + +/*************************************************************** +Gets a row to insert from the values list. 
*/ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /* in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/*************************************************************** +Gets a row to insert from the select list. */ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /* in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/*************************************************************** +Inserts a row to a table. */ + +ulint +row_ins( +/*====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + ins_node_t* node, /* in: row insert node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + node->entry = UT_LIST_GET_FIRST(node->entry_list); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (node->index != NULL) { + err = row_ins_index_entry_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + } + + ut_ad(node->entry == NULL); + + node->state = INS_NODE_ALLOC_ROW_ID; + + return(DB_SUCCESS); +} + +/*************************************************************** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_ins_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + ulint err; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. 
*/ + + if (node->state == INS_NODE_SET_IX_LOCK) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(trx); + + if (UT_DULINT_EQ(trx->id, node->trx_id)) { + /* No need to do IX-locking or write trx id to buf */ + + goto same_trx; + } + + trx_write_trx_id(node->trx_id_buf, trx->id); + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + + node->trx_id = trx->id; + same_trx: + node->state = INS_NODE_ALLOC_ROW_ID; + + if (node->ins_type == INS_SEARCHED) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to insert */ + + thr->run_node = sel_node; + + return(thr); + } + } + + if ((node->ins_type == INS_SEARCHED) + && (sel_node->state != SEL_NODE_FETCH)) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to insert */ + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_ins(node, thr); + +error_handling: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->ins_type == INS_SEARCHED) { + /* Fetch a row to insert */ + + thr->run_node = sel_node; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c new file mode 100644 index 00000000000..13d84ffd358 --- /dev/null +++ b/innobase/row/row0mysql.c @@ -0,0 +1,1116 @@ +/****************************************************** +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. + +(c) 2000 Innobase Oy + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#include "row0ins.h" +#include "row0sel.h" +#include "row0upd.h" +#include "row0row.h" +#include "que0que.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "dict0crea.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "lock0lock.h" + +/*********************************************************************** +Reads a MySQL format variable-length field (like VARCHAR) length and +returns pointer to the field data. */ + +byte* +row_mysql_read_var_ref_noninline( +/*=============================*/ + /* out: field + 2 */ + ulint* len, /* out: variable-length field length */ + byte* field) /* in: field */ +{ + return(row_mysql_read_var_ref(len, field)); +} + +/*********************************************************************** +Stores a reference to a BLOB in the MySQL format. */ + +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /* in: where to store */ + ulint col_len, /* in: dest buffer size: determines into + how many bytes the BLOB length is stored, + this may vary from 1 to 4 bytes */ + byte* data, /* in: BLOB data */ + ulint len) /* in: BLOB length */ +{ + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. 
*/ + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + ut_memcpy(dest + col_len - 8, (byte*)&data, sizeof(byte*)); +} + +/*********************************************************************** +Reads a reference to a BLOB in the MySQL format. */ + +byte* +row_mysql_read_blob_ref( +/*====================*/ + /* out: pointer to BLOB data */ + ulint* len, /* out: BLOB length */ + byte* ref, /* in: BLOB reference in the MySQL format */ + ulint col_len) /* in: BLOB reference length (not BLOB + length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + ut_memcpy((byte*)&data, ref + col_len - 8, sizeof(byte*)); + + return(data); +} + +/****************************************************************** +Convert a row in the MySQL format to a row in the Innobase format. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /* in/out: Innobase row where the + field type information is already + copied there, or will be copied + later */ + byte* buf, /* in/out: buffer to use in converting + data in columns; this must be at least + the size of mysql_rec! */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + byte* mysql_rec) /* in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ +{ + mysql_row_templ_t* templ; + dfield_t* dfield; + ulint i; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + dfield = dtuple_get_nth_field(row, i); + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] & + (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_data(dfield, NULL, UNIV_SQL_NULL); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format(dfield, + prebuilt->ins_upd_rec_buff + + templ->mysql_col_offset, + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + templ->type, templ->is_unsigned); +next_column: + ; + } +} + +/******************************************************************** +Handles user errors and lock waits detected by the database engine. 
*/ + +ibool +row_mysql_handle_errors( +/*====================*/ + /* out: TRUE if it was a lock wait and + we should continue running the query thread */ + ulint* new_err,/* out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /* in: transaction */ + que_thr_t* thr, /* in: query thread */ + trx_savept_t* savept) /* in: savepoint */ +{ + ibool timeout_expired; + ulint err; + +handle_new_error: + err = trx->error_state; + + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + if (err == DB_DUPLICATE_KEY) { + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + } else if (err == DB_TOO_BIG_RECORD) { + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + } else if (err == DB_LOCK_WAIT) { + + timeout_expired = srv_suspend_mysql_thread(thr); + + if (timeout_expired) { + trx->error_state = DB_DEADLOCK; + + que_thr_stop_for_mysql(thr); + + goto handle_new_error; + } + + *new_err = err; + + return(TRUE); + + } else if (err == DB_DEADLOCK) { + + /* Roll back the whole transaction */ + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + } else if (err == DB_OUT_OF_FILE_SPACE) { + + /* Roll back the whole transaction */ + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + } else if (err == DB_MUST_GET_MORE_FILE_SPACE) { + + ut_a(0); /* TODO: print something to MySQL error log */ + } else { + ut_a(0); + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(FALSE); +} + +/************************************************************************ +Create a prebuilt struct for a MySQL table handle. */ + +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + /* out, own: a prebuilt struct */ + dict_table_t* table) /* in: Innobase table handle */ +{ + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dtuple_t* ref; + ulint ref_len; + ulint i; + + heap = mem_heap_create(128); + + prebuilt = mem_heap_alloc(heap, sizeof(row_prebuilt_t)); + + prebuilt->table = table; + + prebuilt->trx = NULL; + + prebuilt->sql_stat_start = TRUE; + + prebuilt->index = NULL; + prebuilt->n_template = 0; + prebuilt->mysql_template = NULL; + + prebuilt->heap = heap; + prebuilt->ins_node = NULL; + + prebuilt->ins_upd_rec_buff = NULL; + + prebuilt->upd_node = NULL; + prebuilt->ins_graph = NULL; + prebuilt->upd_graph = NULL; + + prebuilt->pcur = btr_pcur_create_for_mysql(); + prebuilt->clust_pcur = btr_pcur_create_for_mysql(); + + prebuilt->select_lock_type = LOCK_NONE; + + prebuilt->sel_graph = NULL; + + prebuilt->search_tuple = dtuple_create(heap, + dict_table_get_n_cols(table)); + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + prebuilt->fetch_cache[i] = NULL; + } + + prebuilt->n_fetch_cached = 0; + + prebuilt->blob_heap = NULL; + + prebuilt->old_vers_heap = NULL; + + return(prebuilt); +} + +/************************************************************************ +Free a prebuilt struct for a MySQL table handle. 
*/ + +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt) /* in, own: prebuilt struct */ +{ + ulint i; + + btr_pcur_free_for_mysql(prebuilt->pcur); + btr_pcur_free_for_mysql(prebuilt->clust_pcur); + + if (prebuilt->mysql_template) { + mem_free(prebuilt->mysql_template); + } + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + mem_heap_free(prebuilt->blob_heap); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + if (prebuilt->fetch_cache[i] != NULL) { + mem_free(prebuilt->fetch_cache[i]); + } + } + + mem_heap_free(prebuilt->heap); +} + +/************************************************************************* +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ + +void +row_update_prebuilt_trx( +/*====================*/ + /* out: prebuilt dtuple */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + trx_t* trx) /* in: transaction handle */ +{ + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/************************************************************************* +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. */ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + /* out: prebuilt dtuple */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + ins_node_t* node; + dtuple_t* row; + dict_table_t* table = prebuilt->table; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node == NULL) { + + /* Not called before for this handle: create an insert node + and query graph to the prebuilt struct */ + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == NULL) { + prebuilt->ins_upd_rec_buff = mem_heap_alloc( + prebuilt->heap, + prebuilt->mysql_row_len); + } + + row = dtuple_create(prebuilt->heap, + dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = + que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->ins_node->row); +} + +/************************************************************************* +Updates the table modification counter and calculates new estimates +for table and index statistics if necessary. 
*/ +UNIV_INLINE +void +row_update_statistics_if_needed( +/*============================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct */ +{ + ulint counter; + ulint old_counter; + + counter = prebuilt->table->stat_modif_counter; + + counter += prebuilt->mysql_row_len; + prebuilt->table->stat_modif_counter = counter; + + old_counter = prebuilt->table->stat_last_estimate_counter; + + if (counter - old_counter >= DICT_STAT_CALCULATE_INTERVAL + || counter - old_counter >= + (UNIV_PAGE_SIZE + * prebuilt->table->stat_clustered_index_size / 2)) { + + dict_update_statistics(prebuilt->table); + } +} + +/************************************************************************* +Does an insert for MySQL. */ + +int +row_insert_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (node == NULL) { + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + } + + row_mysql_convert_row_to_innobase(node->row, + prebuilt->ins_upd_rec_buff, + prebuilt, mysql_rec); + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + } + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + if (was_lock_wait) { + goto run_again; + } + + return(err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + prebuilt->table->stat_n_rows++; + + if (prebuilt->table->stat_n_rows == 0) { + /* Avoid wrap-over */ + prebuilt->table->stat_n_rows--; + } + + row_update_statistics_if_needed(prebuilt); + + return((int) err); +} + +/************************************************************************* +Builds a dummy query graph used in selects. */ + +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + sel_node_t* node; + + ut_ad(prebuilt && prebuilt->trx); + + if (prebuilt->sel_graph == NULL) { + + node = sel_node_create(prebuilt->heap); + + prebuilt->sel_graph = + que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + + prebuilt->sel_graph->state = QUE_FORK_ACTIVE; + } +} + +/************************************************************************* +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. 
*/ + +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + /* out: prebuilt update vector */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + upd_node_t* node; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->upd_node == NULL) { + + /* Not called before for this handle: create an update node + and query graph to the prebuilt struct */ + + node = upd_node_create(prebuilt->heap); + + prebuilt->upd_node = node; + + node->in_mysql_interface = TRUE; + node->is_delete = FALSE; + node->searched_update = FALSE; + node->select_will_do_update = FALSE; + node->select = NULL; + node->pcur = btr_pcur_create_for_mysql(); + node->table = table; + + node->update = upd_create(dict_table_get_n_cols(table), + prebuilt->heap); + UT_LIST_INIT(node->columns); + node->has_clust_rec_x_lock = TRUE; + node->cmpl_info = 0; + + node->table_sym = NULL; + node->col_assign_list = NULL; + + prebuilt->upd_graph = + que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->upd_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->upd_node->update); +} + +/************************************************************************* +Does an update or delete of a row for MySQL. */ + +int +row_update_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + ulint err; + que_thr_t* thr; + ibool was_lock_wait; + dict_index_t* clust_index; + ulint ref_len; + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + mem_heap_t* heap; + dtuple_t* search_tuple; + dtuple_t* row_tuple; + mtr_t mtr; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + node = prebuilt->upd_node; + + clust_index = dict_table_get_first_index(table); + + if (prebuilt->in_update_remember_pos) { + if (prebuilt->index == clust_index) { + btr_pcur_copy_stored_position(node->pcur, + prebuilt->pcur); + } else { + btr_pcur_copy_stored_position(node->pcur, + prebuilt->clust_pcur); + } + + ut_ad(node->pcur->rel_pos == BTR_PCUR_ON); + + goto skip_cursor_search; + } + + /* We have to search for the correct cursor position */ + + ref_len = dict_index_get_n_unique(clust_index); + + heap = mem_heap_create(450); + + row_tuple = dtuple_create(heap, dict_table_get_n_cols(table)); + dict_table_copy_types(row_tuple, table); + + if (prebuilt->ins_upd_rec_buff == NULL) { + prebuilt->ins_upd_rec_buff = mem_heap_alloc(prebuilt->heap, + prebuilt->mysql_row_len); + } + + row_mysql_convert_row_to_innobase(row_tuple, + prebuilt->ins_upd_rec_buff, + prebuilt, mysql_rec); + + search_tuple = dtuple_create(heap, ref_len); + + row_build_row_ref_from_row(search_tuple, table, row_tuple); + + mtr_start(&mtr); + + btr_pcur_open_with_no_init(clust_index, search_tuple, PAGE_CUR_LE, + BTR_SEARCH_LEAF, node->pcur, 0, &mtr); + + btr_pcur_store_position(node->pcur, &mtr); + + mtr_commit(&mtr); + + mem_heap_free(heap); + +skip_cursor_search: + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + que_thr_move_to_run_state_for_mysql(thr, trx); +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + if (err != 
DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + + return((int) err); + } + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + if (was_lock_wait) { + goto run_again; + } + + return(err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (prebuilt->upd_node->is_delete) { + if (prebuilt->table->stat_n_rows > 0) { + prebuilt->table->stat_n_rows--; + } + } + + row_update_statistics_if_needed(prebuilt); + + return((int) err); +} + +/************************************************************************* +Checks if a table is such that we automatically created a clustered +index on it (on row id). */ + +ibool +row_table_got_default_clust_index( +/*==============================*/ + dict_table_t* table) +{ + dict_index_t* clust_index; + + clust_index = dict_table_get_first_index(table); + + if (dtype_get_mtype(dict_index_get_nth_type(clust_index, 0)) + == DATA_SYS) { + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Does a table creation operation for MySQL. */ + +int +row_create_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table definition */ + trx_t* trx) /* in: transaction handle */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_enter(&(dict_sys->mutex)); + + heap = mem_heap_create(512); + + trx->dict_operation = TRUE; + + node = tab_create_graph_create(table, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr), + SESS_COMM_EXECUTE, 0)); + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + /* We have special error handling here */ + ut_a(err == DB_OUT_OF_FILE_SPACE); + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + row_drop_table_for_mysql(table->name, trx, TRUE); + + trx->error_state = DB_SUCCESS; + } + + mutex_exit(&(dict_sys->mutex)); + que_graph_free((que_t*) que_node_get_parent(thr)); + + return((int) err); +} + +/************************************************************************* +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. 
*/ + +int +row_create_index_for_mysql( +/*=======================*/ + /* out: error number or DB_SUCCESS */ + dict_index_t* index, /* in: index defintion */ + trx_t* trx) /* in: transaction handle */ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_enter(&(dict_sys->mutex)); + + heap = mem_heap_create(512); + + trx->dict_operation = TRUE; + + node = ind_create_graph_create(index, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr), + SESS_COMM_EXECUTE, 0)); + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + /* We have special error handling here */ + ut_a(err == DB_OUT_OF_FILE_SPACE); + + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + row_drop_table_for_mysql(index->table_name, trx, TRUE); + + trx->error_state = DB_SUCCESS; + } + + mutex_exit(&(dict_sys->mutex)); + + que_graph_free((que_t*) que_node_get_parent(thr)); + + return((int) err); +} + +/************************************************************************* +Drops a table for MySQL. */ + +int +row_drop_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + char* name, /* in: table name */ + trx_t* trx, /* in: transaction handle */ + ibool has_dict_mutex) /* in: TRUE if the caller already owns the + dictionary system mutex */ +{ + dict_table_t* table; + que_thr_t* thr; + que_t* graph; + ulint err; + char* str1; + char* str2; + ulint len; + char buf[10000]; +retry: + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(name != NULL); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. 
*/ + + str1 = + "PROCEDURE DROP_TABLE_PROC () IS\n" + "table_id CHAR;\n" + "index_id CHAR;\n" + "found INT;\n" + "BEGIN\n" + "SELECT ID INTO table_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME ='"; + + str2 = + "';\n" + "IF (SQL % NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO index_id\n" + " FROM SYS_INDEXES\n" + " WHERE TABLE_ID = table_id;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n" + " DELETE FROM SYS_INDEXES WHERE ID = index_id;\n" + " END IF;\n" + "END LOOP;\n" + "DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n" + "DELETE FROM SYS_TABLES WHERE ID = table_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + len = ut_strlen(str1); + + ut_memcpy(buf, str1, len); + ut_memcpy(buf + len, name, ut_strlen(name)); + + len += ut_strlen(name); + + ut_memcpy(buf + len, str2, ut_strlen(str2) + 1); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + if (!has_dict_mutex) { + mutex_enter(&(dict_sys->mutex)); + } + + graph = pars_sql(buf); + + ut_a(graph); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + /* Prevent purge from running while we are dropping the table */ + rw_lock_s_lock(&(purge_sys->purge_is_running)); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + /* Check if there are any locks on the table: if yes, it cannot + be dropped: we have to wait for the locks to be released */ + + if (lock_is_on_table(table)) { + + err = DB_TABLE_IS_BEING_USED; + + goto funct_exit; + } + + /* TODO: check that MySQL prevents users from accessing the table + after this function row_drop_table_for_mysql has been called: + otherwise anyone with an open handle to the table could, for example, + come to read the table! */ + + trx->dict_operation = TRUE; + trx->table_id = table->id; + + ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + ut_a(err == DB_OUT_OF_FILE_SPACE); + + err = DB_MUST_GET_MORE_FILE_SPACE; + + row_mysql_handle_errors(&err, trx, thr, NULL); + + ut_a(0); + } else { + dict_table_remove_from_cache(table); + } +funct_exit: + rw_lock_s_unlock(&(purge_sys->purge_is_running)); + + if (!has_dict_mutex) { + mutex_exit(&(dict_sys->mutex)); + } + + que_graph_free(graph); + + if (err == DB_TABLE_IS_BEING_USED) { + os_thread_sleep(200000); + + goto retry; + } + + return((int) err); +} + +/************************************************************************* +Renames a table for MySQL. 
*/ + +int +row_rename_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + char* old_name, /* in: old table name */ + char* new_name, /* in: new table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + que_thr_t* thr; + que_t* graph; + ulint err; + char* str1; + char* str2; + char* str3; + ulint len; + char buf[10000]; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(old_name != NULL); + ut_a(new_name != NULL); + + str1 = + "PROCEDURE RENAME_TABLE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET NAME ='"; + + str2 = + "' WHERE NAME = '"; + + str3 = + "';\n" + "COMMIT WORK;\n" + "END;\n"; + + len = ut_strlen(str1); + + ut_memcpy(buf, str1, len); + + ut_memcpy(buf + len, new_name, ut_strlen(new_name)); + + len += ut_strlen(new_name); + + ut_memcpy(buf + len, str2, ut_strlen(str2)); + + len += ut_strlen(str2); + + ut_memcpy(buf + len, old_name, ut_strlen(old_name)); + + len += ut_strlen(old_name); + + ut_memcpy(buf + len, str3, ut_strlen(str3) + 1); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_enter(&(dict_sys->mutex)); + + table = dict_table_get_low(old_name); + + graph = pars_sql(buf); + + ut_a(graph); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + row_mysql_handle_errors(&err, trx, thr, NULL); + } else { + ut_a(dict_table_rename_in_cache(table, new_name)); + } +funct_exit: + mutex_exit(&(dict_sys->mutex)); + + que_graph_free(graph); + + return((int) err); +} diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c new file mode 100644 index 00000000000..0a6fabe584c --- /dev/null +++ b/innobase/row/row0purge.c @@ -0,0 +1,553 @@ +/****************************************************** +Purge obsolete records + +(c) 1997 Innobase Oy + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" + +#ifdef UNIV_NONINL +#include "row0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "log0log.h" + +/************************************************************************ +Creates a purge node to a query graph. */ + +purge_node_t* +row_purge_node_create( +/*==================*/ + /* out, own: purge node */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + purge_node_t* node; + + ut_ad(parent && heap); + + node = mem_heap_alloc(heap, sizeof(purge_node_t)); + + node->common.type = QUE_NODE_PURGE; + node->common.parent = parent; + + node->heap = mem_heap_create(256); + + return(node); +} + +/*************************************************************** +Repositions the pcur in the purge node on the clustered index record, +if found. 
*/ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + /* out: TRUE if the record was found */ + ulint mode, /* in: latching mode */ + purge_node_t* node, /* in: row purge node */ + mtr_t* mtr) /* in: mtr */ +{ + ibool found; + + if (node->found_clust) { + found = btr_pcur_restore_position(mode, &(node->pcur), mtr); + + return(found); + } + + found = row_search_on_row_ref(&(node->pcur), mode, node->table, + node->ref, mtr); + node->found_clust = found; + + if (found) { + btr_pcur_store_position(&(node->pcur), mtr); + } + + return(found); +} + +/*************************************************************** +Removes a delete marked clustered index record if possible. */ +static +ibool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + /* out: TRUE if success, or if not found, or + if modified after the delete marking */ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr, /* in: query thread */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ibool success; + ulint err; + mtr_t mtr; + + UT_NOT_USED(thr); + + index = dict_table_get_first_index(node->table); + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + mtr_start(&mtr); + + success = row_purge_reposition_pcur(mode, node, &mtr); + + if (!success) { + /* The record is already removed */ + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + if (0 != ut_dulint_cmp(node->roll_ptr, + row_get_rec_roll_ptr(btr_pcur_get_rec(pcur), index))) { + + /* Someone else has modified the record later: do not remove */ + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr); + + if (err == DB_SUCCESS) { + success = TRUE; + } else if (err == DB_OUT_OF_FILE_SPACE) { + success = FALSE; + } else { + ut_a(0); + } + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(success); +} + +/*************************************************************** +Removes a clustered index record if it has not been modified after the delete +marking. */ +static +void +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr) /* in: query thread */ +{ + ibool success; + ulint n_tries = 0; + +/* printf("Purge: Removing clustered record\n"); */ + + success = row_purge_remove_clust_if_poss_low(node, thr, + BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_clust_if_poss_low(node, thr, + BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/*************************************************************** +Removes a secondary index entry if possible. 
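+As with the clustered index record above, removal is attempted in two
+phases: first optimistically, holding only a leaf-page latch, and, if that
+fails, pessimistically with BTR_MODIFY_TREE, which may reorganize the tree
+and can only fail when the tablespace runs out of space. A condensed sketch
+of the retry idiom shared by both callers (attempt_delete is a hypothetical
+stand-in for the _low functions):
+
+        ibool   success;
+        ulint   n_tries = 0;
+
+        success = attempt_delete(BTR_MODIFY_LEAF);
+
+        while (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+                n_tries++;
+                os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+                success = attempt_delete(BTR_MODIFY_TREE);
+        }
+
+        ut_a(success);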
*/ +static +ibool +row_purge_remove_sec_if_poss_low( +/*=============================*/ + /* out: TRUE if success or if not found */ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode) /* in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has; + ibool found; + ulint err; + mtr_t mtr; + mtr_t mtr_vers; + + UT_NOT_USED(thr); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + if (!found) { + /* Not found */ + + /* FIXME: printf("PURGE:........sec entry not found\n"); */ + /* dtuple_print(entry); */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(TRUE); + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + + mtr_start(&mtr_vers); + + success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers); + + if (success) { + old_has = row_vers_old_has_index_entry(TRUE, + btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + + if (!success || !old_has) { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr); + + if (err == DB_SUCCESS) { + success = TRUE; + } else if (err == DB_OUT_OF_FILE_SPACE) { + success = FALSE; + } else { + ut_a(0); + } + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(success); +} + +/*************************************************************** +Removes a secondary index entry if possible. */ +UNIV_INLINE +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + ibool success; + ulint n_tries = 0; + +/* printf("Purge: Removing secondary record\n"); */ + + success = row_purge_remove_sec_if_poss_low(node, thr, index, entry, + BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_low(node, thr, index, entry, + BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/*************************************************************** +Purges a delete marking of a record. 
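+All secondary index entries are removed before the clustered index record:
+while a secondary entry is being checked, row_vers_old_has_index_entry must
+still be able to reposition on the clustered record and inspect its version
+chain.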
*/ +static +void +row_purge_del_mark( +/*===============*/ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + + ut_ad(node && thr); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + /* Build the index entry */ + entry = row_build_index_entry(node->row, index, heap); + + row_purge_remove_sec_if_poss(node, thr, index, entry); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + row_purge_remove_clust_if_poss(node, thr); +} + +/*************************************************************** +Purges an update of an existing record. */ +static +void +row_purge_upd_exist( +/*================*/ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + + ut_ad(node && thr); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field(NULL, node->index, + node->update)) { + /* Build the older version of the index entry */ + entry = row_build_index_entry(node->row, index, heap); + + row_purge_remove_sec_if_poss(node, thr, index, entry); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); +} + +/*************************************************************** +Parses the row reference and other info in a modify undo log record. */ +static +ibool +row_purge_parse_undo_rec( +/*=====================*/ + /* out: TRUE if purge operation required */ + purge_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + + ut_ad(node && thr); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + &undo_no, &table_id); + node->rec_type = type; + + if (type == TRX_UNDO_UPD_DEL_REC) { + + return(FALSE); + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + node->table = NULL; + + if (type == TRX_UNDO_UPD_EXIST_REC + && cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + + /* Purge requires no changes to indexes: we may return */ + + return(FALSE); + } + + /* NOTE that the table has to be explicitly released later */ + + /* TODO: currently nothing prevents dropping of table when purge + is accessing it! */ + + mutex_enter(&(dict_sys->mutex)); + + node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr)); + + rw_lock_x_lock(&(purge_sys->purge_is_running)); + + mutex_exit(&(dict_sys->mutex)); + + if (node->table == NULL) { + /* The table has been dropped: no need to do purge */ + + rw_lock_x_unlock(&(purge_sys->purge_is_running)); + + return(FALSE); + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, node->heap, + &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + ptr = trx_undo_rec_get_partial_row(ptr, clust_index, &(node->row), + node->heap); + return(TRUE); +} + +/*************************************************************** +Fetches an undo log record and does the purge for the recorded operation. 
+If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. */ +static +ulint +row_purge( +/*======*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr) /* in: query thread */ +{ + dulint roll_ptr; + ibool purge_needed; + + ut_ad(node && thr); + + node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr, + &(node->reservation), + node->heap); + if (!node->undo_rec) { + /* Purge completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + + if (node->undo_rec == &trx_purge_dummy_rec) { + purge_needed = FALSE; + } else { + purge_needed = row_purge_parse_undo_rec(node, thr); + } + + if (purge_needed) { + node->found_clust = FALSE; + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + row_purge_upd_exist(node, thr); + } else { + ut_ad(node->rec_type == TRX_UNDO_DEL_MARK_REC); + row_purge_del_mark(node, thr); + } + + if (node->found_clust) { + btr_pcur_close(&(node->pcur)); + } + + rw_lock_x_unlock(&(purge_sys->purge_is_running)); + } + + /* Do some cleanup */ + trx_purge_rec_release(node->reservation); + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(DB_SUCCESS); +} + +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. */ + +que_thr_t* +row_purge_step( +/*===========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + purge_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + + err = row_purge(node, thr); + + ut_ad(err == DB_SUCCESS); + + return(thr); +} diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c new file mode 100644 index 00000000000..f85789fa0d6 --- /dev/null +++ b/innobase/row/row0row.c @@ -0,0 +1,652 @@ +/****************************************************** +General row routines + +(c) 1996 Innobase Oy + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" + +#ifdef UNIV_NONINL +#include "row0row.ic" +#endif + +#include "dict0dict.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" + +/************************************************************************* +Reads the trx id or roll ptr field from a clustered index record: this function +is slower than the specialized inline functions. 
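+Every clustered index record carries the hidden system columns DATA_TRX_ID
+and DATA_ROLL_PTR; this helper looks up the position of the requested
+column with dict_index_get_sys_col_pos and decodes the field. Typical use,
+as a sketch:
+
+        dulint  trx_id;
+
+        trx_id = row_get_rec_sys_field(DATA_TRX_ID, rec, clust_index);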
*/ + +dulint +row_get_rec_sys_field( +/*==================*/ + /* out: value of the field */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + rec_t* rec, /* in: record */ + dict_index_t* index) /* in: clustered index */ +{ + ulint pos; + byte* field; + ulint len; + + ut_ad(index->type & DICT_CLUSTERED); + + pos = dict_index_get_sys_col_pos(index, type); + + field = rec_get_nth_field(rec, pos, &len); + + if (type == DATA_TRX_ID) { + + return(trx_read_trx_id(field)); + } else { + ut_ad(type == DATA_ROLL_PTR); + + return(trx_read_roll_ptr(field)); + } +} + +/************************************************************************* +Sets the trx id or roll ptr field in a clustered index record: this function +is slower than the specialized inline functions. */ + +void +row_set_rec_sys_field( +/*==================*/ + /* out: value of the field */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + dulint val) /* in: value to set */ +{ + ulint pos; + byte* field; + ulint len; + + ut_ad(index->type & DICT_CLUSTERED); + + pos = dict_index_get_sys_col_pos(index, type); + + field = rec_get_nth_field(rec, pos, &len); + + if (type == DATA_TRX_ID) { + + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + + trx_write_roll_ptr(field, val); + } +} + +/********************************************************************* +When an insert to a table is performed, this function builds the entry which +has to be inserted to an index on the table. */ + +dtuple_t* +row_build_index_entry( +/*==================*/ + /* out: index entry which should be inserted */ + dtuple_t* row, /* in: row which should be inserted to the + table */ + dict_index_t* index, /* in: index on the table */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the index entry is allocated */ +{ + dtuple_t* entry; + ulint entry_len; + dict_field_t* ind_field; + dfield_t* dfield; + dfield_t* dfield2; + dict_col_t* col; + ulint i; + + ut_ad(row && index && heap); + ut_ad(dtuple_check_typed(row)); + + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + if (index->type & DICT_UNIVERSAL) { + dtuple_set_n_fields_cmp(entry, entry_len); + } else { + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + } + + for (i = 0; i < entry_len; i++) { + ind_field = dict_index_get_nth_field(index, i); + col = ind_field->col; + + dfield = dtuple_get_nth_field(entry, i); + + dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_copy(dfield, dfield2); + dfield->col_no = dict_col_get_no(col); + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*********************************************************************** +An inverse function to dict_row_build_index_entry. Builds a row from a +record in a clustered index. */ + +dtuple_t* +row_build( +/*======*/ + /* out, own: row built; see the NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap as the latter only places pointers to + data fields on the index page, and thus is + more efficient */ + dict_index_t* index, /* in: clustered index */ + rec_t* rec, /* in: record in the clustered index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row dtuple is used! 
*/ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dtuple_t* row; + dict_table_t* table; + ulint n_fields; + ulint i; + dfield_t* dfield; + byte* field; + ulint len; + ulint row_len; + dict_col_t* col; + byte* buf; + + ut_ad(index && rec && heap); + ut_ad(index->type & DICT_CLUSTERED); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_get_size(rec)); + rec = rec_copy(buf, rec); + } + + table = index->table; + row_len = dict_table_get_n_cols(table); + + row = dtuple_create(heap, row_len); + + dtuple_set_info_bits(row, rec_get_info_bits(rec)); + + n_fields = dict_index_get_n_fields(index); + + ut_ad(n_fields == rec_get_n_fields(rec)); + + dict_table_copy_types(row, table); + + for (i = 0; i < n_fields; i++) { + + col = dict_field_get_col(dict_index_get_nth_field(index, i)); + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + field = rec_get_nth_field(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + + ut_ad(dtuple_check_typed(row)); + + return(row); +} + +/*********************************************************************** +An inverse function to dict_row_build_index_entry. Builds a row from a +record in a clustered index. */ + +void +row_build_to_tuple( +/*===============*/ + dtuple_t* row, /* in/out: row built; see the NOTE below! */ + dict_index_t* index, /* in: clustered index */ + rec_t* rec) /* in: record in the clustered index; + NOTE: the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row dtuple is used! */ +{ + dict_table_t* table; + ulint n_fields; + ulint i; + dfield_t* dfield; + byte* field; + ulint len; + ulint row_len; + dict_col_t* col; + + ut_ad(index && rec); + ut_ad(index->type & DICT_CLUSTERED); + + table = index->table; + row_len = dict_table_get_n_cols(table); + + dtuple_set_info_bits(row, rec_get_info_bits(rec)); + + n_fields = dict_index_get_n_fields(index); + + ut_ad(n_fields == rec_get_n_fields(rec)); + + dict_table_copy_types(row, table); + + for (i = 0; i < n_fields; i++) { + + col = dict_field_get_col(dict_index_get_nth_field(index, i)); + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + field = rec_get_nth_field(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + + ut_ad(dtuple_check_typed(row)); +} + +/*********************************************************************** +Converts an index record to a typed data tuple. */ + +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + /* out, own: index entry built; see the + NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap as the latter only places pointers to + data fields on the index page */ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the dtuple is used! 
*/ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + ulint i; + byte* field; + ulint len; + ulint rec_len; + byte* buf; + + ut_ad(rec && heap && index); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_get_size(rec)); + rec = rec_copy(buf, rec); + } + + rec_len = rec_get_n_fields(rec); + + entry = dtuple_create(heap, rec_len); + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(rec_len == dict_index_get_n_fields(index)); + + dict_index_copy_types(entry, index, rec_len); + + dtuple_set_info_bits(entry, rec_get_info_bits(rec)); + + for (i = 0; i < rec_len; i++) { + + dfield = dtuple_get_nth_field(entry, i); + field = rec_get_nth_field(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ + +dtuple_t* +row_build_row_ref( +/*==============*/ + /* out, own: row reference built; see the + NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dict_col_t* col; + dtuple_t* ref; + byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint i; + + ut_ad(index && rec && heap); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = mem_heap_alloc(heap, rec_get_size(rec)); + + rec = rec_copy(buf, rec); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + col = dict_field_get_col( + dict_index_get_nth_field(clust_index, i)); + pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col)); + + if (pos != ULINT_UNDEFINED) { + field = rec_get_nth_field(rec, pos, &len); + + dfield_set_data(dfield, field, len); + } else { + ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER); + ut_ad(i == table->mix_len); + + dfield_set_data(dfield, + mem_heap_alloc(heap, table->mix_id_len), + table->mix_id_len); + ut_memcpy(dfield_get_data(dfield), table->mix_id_buf, + table->mix_id_len); + } + } + + ut_ad(dtuple_check_typed(ref)); + + return(ref); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ + +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; see the + NOTE below! 
*/ + dict_index_t* index, /* in: index */ + rec_t* rec) /* in: record in the index; + NOTE: the data fields in ref will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dict_col_t* col; + byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint i; + + ut_ad(ref && index && rec); + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + col = dict_field_get_col( + dict_index_get_nth_field(clust_index, i)); + pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col)); + + if (pos != ULINT_UNDEFINED) { + field = rec_get_nth_field(rec, pos, &len); + + dfield_set_data(dfield, field, len); + } else { + ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER); + ut_ad(i == table->mix_len); + ut_a(0); + } + } + + ut_ad(dtuple_check_typed(ref)); +} + +/*********************************************************************** +From a row build a row reference with which we can search the clustered +index record. */ + +void +row_build_row_ref_from_row( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; see the + NOTE below! ref must have the right number + of fields! */ + dict_table_t* table, /* in: table */ + dtuple_t* row) /* in: row + NOTE: the data fields in ref will point + directly into data of this row */ +{ + dict_index_t* clust_index; + dfield_t* dfield; + dfield_t* dfield2; + dict_col_t* col; + ulint ref_len; + ulint i; + + ut_ad(ref && table && row); + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + col = dict_field_get_col( + dict_index_get_nth_field(clust_index, i)); + + dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_copy(dfield, dfield2); + } + + ut_ad(dtuple_check_typed(ref)); +} + +/******************************************************************* +Searches the clustered index record for a row, if we have the row reference. */ + +ibool +row_search_on_row_ref( +/*==================*/ + /* out: TRUE if found */ + btr_pcur_t* pcur, /* in/out: persistent cursor, which must + be closed by the caller */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + dict_table_t* table, /* in: table */ + dtuple_t* ref, /* in: row reference */ + mtr_t* mtr) /* in: mtr */ +{ + ulint low_match; + rec_t* rec; + dict_index_t* index; + page_t* page; + + ut_ad(dtuple_check_typed(ref)); + + index = dict_table_get_first_index(table); + + btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr); + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + page = buf_frame_align(rec); + + if (rec == page_get_infimum_rec(page)) { + + return(FALSE); + } + + if (low_match != dtuple_get_n_fields(ref)) { + + return(FALSE); + } + + return(TRUE); +} + +/************************************************************************* +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. 
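+The lookup combines two helpers defined above: the secondary record is
+first projected to a row reference with row_build_row_ref, using
+ROW_COPY_POINTERS so that no field data is copied, and the reference is
+then passed to row_search_on_row_ref to position a persistent cursor on the
+clustered index; the record counts as found only if the PAGE_CUR_LE search
+matched every field of the reference.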
*/ + +rec_t* +row_get_clust_rec( +/*==============*/ + /* out: record or NULL, if no record found */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: secondary index */ + dict_index_t** clust_index,/* out: clustered index */ + mtr_t* mtr) /* in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + ibool found; + rec_t* clust_rec; + + ut_ad((index->type & DICT_CLUSTERED) == 0); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + clust_rec = btr_pcur_get_rec(&pcur); + + mem_heap_free(heap); + + btr_pcur_close(&pcur); + + *clust_index = dict_table_get_first_index(table); + + if (!found) { + + return(NULL); + } + + return(clust_rec); +} + +/******************************************************************* +Searches an index record. */ + +ibool +row_search_index_entry( +/*===================*/ + /* out: TRUE if found */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /* in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /* in: mtr */ +{ + ulint n_fields; + ulint low_match; + page_t* page; + rec_t* rec; + + ut_ad(dtuple_check_typed(entry)); + + btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr); + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + page = buf_frame_align(rec); + + n_fields = dtuple_get_n_fields(entry); + + if (rec == page_get_infimum_rec(page)) { + + return(FALSE); + } + + if (low_match != n_fields) { + /* Not found */ + + return(FALSE); + } + + return(TRUE); +} diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c new file mode 100644 index 00000000000..bd7af5743d8 --- /dev/null +++ b/innobase/row/row0sel.c @@ -0,0 +1,2732 @@ +/******************************************************* +Select + +(c) 1997 Innobase Oy + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" + +#ifdef UNIV_NONINL +#include "row0sel.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/************************************************************************* +Creates a select node struct. 
*/ + +sel_node_t* +sel_node_create( +/*============*/ + /* out, own: select node struct */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + sel_node_t* node; + + node = mem_heap_alloc(heap, sizeof(sel_node_t)); + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->select_will_do_update = FALSE; + node->latch_mode = BTR_SEARCH_LEAF; + + node->plans = NULL; + + return(node); +} + +/************************************************************************* +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ + +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /* in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/************************************************************************* +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/************************************************************************* +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /* in: first variable in a list of variables */ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + exp = node->select_list; + + while (var) { + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + var = que_node_get_next(var); + } +} + +/************************************************************************* +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /* in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + func_node = node->select_list; + + while (func_node) { + eval_node_set_int_val(func_node, 0); + + func_node = que_node_get_next(func_node); + } + + node->aggregate_already_fetched = FALSE; +} + +/************************************************************************* +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /* in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/************************************************************************* +Fetches the column values from a record. 
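+Each column symbol stores two possible field positions, one for a clustered
+and one for a secondary index (SYM_CLUST_FIELD_NO and SYM_SEC_FIELD_NO); a
+position of ULINT_UNDEFINED means the column does not occur in the index
+through which the record was fetched, and that column is then skipped.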
*/ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /* in: record index */ + rec_t* rec, /* in: record in a clustered or non-clustered + index */ + sym_node_t* column) /* in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + byte* data; + ulint len; + + if (index->type & DICT_CLUSTERED) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + data = rec_get_nth_field(rec, field_no, &len); + + if (column->copy_val) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /* in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH + * sizeof(sel_buf_t)); + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + + sel_buf->val_buf_size = 0; + } +} + +/************************************************************************* +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ + +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + mem_free(sel_buf->data); + } + } +} + +/************************************************************************* +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. 
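+Note that the buffers are exchanged rather than copied: the column value
+takes over the data pointer of the prefetch slot, while the slot takes
+custody of the buffer previously owned by the value. This avoids a memcpy
+per column and row, and keeps every dynamically allocated buffer reachable
+so that sel_col_prefetch_buf_free can release it later.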
*/ +static +void +sel_pop_prefetched_row( +/*===================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); +#ifdef UNIV_DEBUG + dfield_set_data(val, NULL, 0); +#endif + goto next_col; + } + + ut_ad(column->prefetch_buf); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = dfield_get_data(val); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/************************************************************************* +Pushes the column values for a prefetched, cached row to the column prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_push_prefetched_row( +/*====================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! 
*/ + + goto next_col; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = dfield_get_data(val); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers( +/*====================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record in a clustered index */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (plan->old_vers_heap) { + mem_heap_empty(plan->old_vers_heap); + } else { + plan->old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read(rec, mtr, plan->index, + read_view, plan->old_vers_heap, + old_vers); + return(err); +} + +/************************************************************************* +Tests the conditions which determine when the index segment we are searching +through has been exhausted. */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(cond->args); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Tests the other conditions. */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. 
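+The behavior splits on the read mode, roughly:
+
+        locking read (no read view):    place a record lock on the
+                                        clustered record and use it as is
+        consistent read, visible:       use the record as is
+        consistent read, not visible:   build the previous version; a NULL
+                                        result means the row was inserted
+                                        after the read view was opened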
*/ +static +ulint +row_sel_get_clust_rec( +/*==================*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select_node */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record in a non-clustered index */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec); + + index = dict_table_get_first_index(plan->table); + + btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE, + node->latch_mode, &(plan->clust_pcur), + 0, mtr); + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + ut_ad(page_rec_is_user_rec(clust_rec)); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + + err = lock_clust_rec_read_check_and_lock(0, clust_rec, index, + node->row_lock_mode, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (!lock_clust_rec_cons_read_sees(clust_rec, index, + node->read_view)) { + + err = row_sel_build_prev_vers(node->read_view, plan, + clust_rec, &old_vers, mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + *out_rec = clust_rec; + + return(DB_SUCCESS); + } + } + } + + /* Fetch the columns needed in test conditions */ + + row_sel_fetch_columns(index, clust_rec, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; + + return(DB_SUCCESS); +} + +/************************************************************************* +Sets a lock on a record. */ +UNIV_INLINE +ulint +sel_set_rec_lock( +/*=============*/ + /* out: DB_SUCCESS or error code */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index */ + ulint mode, /* in: lock mode */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + if (index->type & DICT_CLUSTERED) { + err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, + thr); + } else { + err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, + thr); + } + + return(err); +} + +/************************************************************************* +Opens a pcur to a table index. 
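+The right-hand sides of the end conditions are evaluated first, because the
+exact-match prefix of the search tuple is copied from them; if the plan has
+no search tuple at all, the cursor is simply opened at one end of the
+index, the end being chosen by plan->asc.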
*/ +static +void +row_sel_open_pcur( +/*==============*/ + sel_node_t* node, /* in: select node */ + plan_t* plan, /* in: table plan */ + ibool search_latch_locked, + /* in: TRUE if the thread currently + has the search latch locked in + s-mode */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + func_node_t* cond; + que_node_t* exp; + ulint n_fields; + ulint has_search_latch = 0; /* RW_S_LATCH or 0 */ + ulint i; + + if (search_latch_locked) { + has_search_latch = RW_S_LATCH; + } + + index = plan->index; + + /* Calculate the value of the search tuple: the exact match columns + get their expressions evaluated when we evaluate the right sides of + end_conds */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + eval_exp(que_node_get_next(cond->args)); + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + + if (plan->n_exact_match < n_fields) { + /* There is a non-exact match field which must be + evaluated separately */ + + eval_exp(plan->tuple_exps[n_fields - 1]); + } + + for (i = 0; i < n_fields; i++) { + exp = plan->tuple_exps[i]; + + dfield_copy_data(dtuple_get_nth_field(plan->tuple, i), + que_node_get_val(exp)); + } + + /* Open pcur to the index */ + + btr_pcur_open_with_no_init(index, plan->tuple, plan->mode, + node->latch_mode, &(plan->pcur), + has_search_latch, mtr); + } else { + /* Open the cursor to the start or the end of the index + (FALSE: no init) */ + + btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode, + &(plan->pcur), FALSE, mtr); + } + + ut_ad(plan->n_rows_prefetched == 0); + ut_ad(plan->n_rows_fetched == 0); + ut_ad(plan->cursor_at_end == FALSE); + + plan->pcur_is_open = TRUE; +} + +/************************************************************************* +Restores a stored pcur position to a table index. */ +UNIV_INLINE +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + /* out: TRUE if the cursor should be moved to + the next record after we return from this + function (moved to the previous, in the case + of a descending cursor) without processing + again the current cursor record */ + sel_node_t* node, /* in: select node */ + plan_t* plan, /* in: table plan */ + mtr_t* mtr) /* in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = btr_pcur_restore_position(node->latch_mode, + &(plan->pcur), mtr); + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. 
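+
+In short, for an ascending cursor:
+
+        BTR_PCUR_AFTER          do not move: the current record has not
+                                been processed yet
+        BTR_PCUR_ON, unequal    move to the next record
+        BTR_PCUR_ON, equal      move only if the stored record was already
+                                processed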
*/ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER); + + return(TRUE); +} + +/************************************************************************* +Resets a plan cursor to a closed state. */ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /* in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +/************************************************************************* +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ + sel_node_t* node, /* in: select node for a consistent read */ + plan_t* plan, /* in: plan for a unique search in clustered + index */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + rec_t* rec; + + index = plan->index; + + ut_ad(node->read_view); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + + row_sel_open_pcur(node, plan, TRUE, mtr); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (index->type & DICT_CLUSTERED) { + if (!lock_clust_rec_cons_read_sees(rec, index, + node->read_view)) { + return(SEL_RETRY); + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { + + return(SEL_RETRY); + } + + /* Test deleted flag. Fetch the columns needed in test conditions. 
*/ + + row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + + if (rec_get_deleted_flag(rec)) { + + return(SEL_EXHAUSTED); + } + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + return(SEL_EXHAUSTED); + } + + ut_ad(plan->pcur.latch_mode == node->latch_mode); + + plan->n_rows_fetched++; + + return(SEL_FOUND); +} + +/************************************************************************* +Performs a select step. */ +static +ulint +row_sel( +/*====*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + ibool search_latch_locked; + ibool consistent_read; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool leaf_contains_updates = FALSE; + /* TRUE if select_will_do_update is + TRUE and the current clustered index + leaf page has been updated during + the current mtr: mtr must be committed + at the same time as the leaf x-latch + is released */ + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + ulint found_flag; + ulint err; + + ut_ad(thr->run_node == node); + + search_latch_locked = FALSE; + + if (node->read_view) { + /* In consistent reads, we try to do with the hash index and + not to use the buffer page get. This is to reduce memory bus + load resulting from semaphore operations. The search latch + will be s-locked when we access an index with a unique search + condition, but not locked when we access an index with a + less selective search condition. */ + + consistent_read = TRUE; + } else { + consistent_read = FALSE; + } + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. 
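+The loop implements a nested-loops join in flattened form: node->fetch_table
+is the index of the plan currently being advanced; it is incremented when
+the current table yields a qualifying row, and decremented (backtracking)
+when the table cursor is exhausted.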
*/ + + ut_ad(leaf_contains_updates == FALSE); + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr_start(&mtr); + + if (consistent_read && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust) { + if (!search_latch_locked) { + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; + } else if (btr_search_latch.writer_is_wait_ex) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often + kept for some 10 searches before being released, + a waiting x-latch request would block other threads + from acquiring an s-latch for a long time, lowering + performance significantly in multiprocessors. */ + + rw_lock_s_unlock(&btr_search_latch); + rw_lock_s_lock(&btr_search_latch); + } + + found_flag = row_sel_try_search_shortcut(node, plan, &mtr); + + if (found_flag == SEL_FOUND) { + + goto next_table; + + } else if (found_flag == SEL_EXHAUSTED) { + + goto table_exhausted; + } + + ut_ad(found_flag == SEL_RETRY); + + plan_reset_cursor(plan); + + mtr_commit(&mtr); + mtr_start(&mtr); + } + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + + search_latch_locked = FALSE; + } + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + + row_sel_open_pcur(node, plan, search_latch_locked, &mtr); + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && (rec != page_get_supremum_rec(buf_frame_align(rec)))) { + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. 
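+For example, in a repeatable read scanning backwards from the largest key
+value, locking only the records actually seen would let another transaction
+insert a new largest record, so that a second scan would return a row the
+first one did not: a phantom. Locking the successor record closes that gap.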
*/ + + if (!consistent_read) { + err = sel_set_rec_lock(page_rec_get_next(rec), index, + node->row_lock_mode, thr); + if (err != DB_SUCCESS) { + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + + goto lock_wait_or_error; + } + } + } + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. */ + + cost_counter++; + + goto next_rec; + } + + if (!consistent_read) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } else if (plan->mixed_index) { + /* We have to check if the record in a mixed cluster belongs + to this table */ + + if (!dict_is_mixed_table_rec(plan->table, rec)) { + + goto next_rec; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + if (consistent_read) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (index->type & DICT_CLUSTERED) { + + if (!lock_clust_rec_cons_read_sees(rec, index, + node->read_view)) { + + err = row_sel_build_prev_vers(node->read_view, + plan, rec, &old_vers, + &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + row_sel_fetch_columns(index, rec, + UT_LIST_GET_FIRST(plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, + node->read_view)) { + cons_read_requires_clust_rec = TRUE; + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions */ + + row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* 
No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(consistent_read); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec)) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == node->latch_mode); + + if (node->select_will_do_update) { + /* This is a searched update and we can do the update in-place, + saving CPU time */ + + row_upd_in_place_in_select(node, thr, &mtr); + + leaf_contains_updates = TRUE; + + /* When the database is in the online backup mode, the number + of log records for a single mtr should be small: increment the + cost counter to ensure it */ + + cost_counter += 1 + (SEL_COST_LIMIT / 8); + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_push_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_pop_prefetched_row(plan); + + goto next_table; + } + +next_rec: + ut_ad(!search_latch_locked); + + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
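+		In other words: we store the position of pcur, commit
+		&mtr so that all page latches are released, and the
+		cursor is later restored from the stored position under
+		a fresh mtr; see commit_mtr_for_a_while below.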
*/ + + goto commit_mtr_for_a_while; + } + + if (leaf_contains_updates + && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) { + + /* We must commit &mtr if we are moving to a different page, + because we have done updates to the x-latched leaf page, and + the latch would be released in btr_pcur_move_to_next, without + &mtr getting committed there */ + + ut_ad(node->asc); + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + return(DB_SUCCESS); + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + return(DB_SUCCESS); + } + + node->state = SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + return(DB_SUCCESS); + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. 
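+	In effect the lock also protects the gap before the locked
+	record: an insert into the interval covered by the stored cursor
+	position must wait for the lock, and thus cannot slip into the
+	result set unnoticed while this query thread is suspended.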
*/ + + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + return(DB_SUCCESS); + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. */ + + plan->stored_cursor_rec_processed = TRUE; + + ut_ad(!search_latch_locked); + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + goto table_loop; + +lock_wait_or_error: + /* See the note at stop_for_a_while: the same holds for this case */ + + ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr) + || !node->asc); + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + return(err); +} + +/************************************************************************** +Performs a select step. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_sel_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint i_lock_mode; + sym_node_t* table_node; + sel_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_SELECT); + + /* If this is a new time this node is executed (or when execution + resumes after wait for a table intention lock), set intention locks + on the tables, or assign a read view */ + + if (node->into_list && (thr->prev_node == que_node_get_parent(node))) { + + node->state = SEL_NODE_OPEN; + } + + if (node->state == SEL_NODE_OPEN) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(thr_get_trx(thr)); + + plan_reset_cursor(sel_node_get_nth_plan(node, 0)); + + if (node->consistent_read) { + /* Assign a read view for the query */ + node->read_view = trx_assign_read_view( + thr_get_trx(thr)); + } else { + if (node->set_x_locks) { + i_lock_mode = LOCK_IX; + } else { + i_lock_mode = LOCK_IS; + } + + table_node = node->table_list; + + while (table_node) { + err = lock_table(0, table_node->table, + i_lock_mode, thr); + if (err != DB_SUCCESS) { + + que_thr_handle_error(thr, DB_ERROR, + NULL, 0); + return(NULL); + } + + table_node = que_node_get_next(table_node); + } + } + + /* If this is an explicit cursor, copy stored procedure + variable values, so that the values cannot change between + fetches (currently, we copy them also for non-explicit + cursors) */ + + if (node->explicit_cursor && + UT_LIST_GET_FIRST(node->copy_variables)) { + + row_sel_copy_input_variable_vals(node); + } + + node->state = SEL_NODE_FETCH; + node->fetch_table = 0; + + if (node->is_aggregate) { + /* Reset the aggregate total values */ + sel_reset_aggregate_vals(node); + } + } + + err = row_sel(node, thr); + + /* NOTE! 
if queries are parallelized, the following assignment may + have problems; the assignment should be made only if thr is the + only top-level thr in the graph: */ + + thr->graph->last_sel_node = node; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + printf("SQL error %lu\n", err); + + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + + return(NULL); + } + + return(thr); +} + +/************************************************************************** +Performs a fetch for a cursor. */ + +que_thr_t* +fetch_step( +/*=======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + sel_assign_into_var_values(node->into_list, sel_node); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + /* SQL error detected */ + printf("SQL error %lu\n", DB_ERROR); + + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/*************************************************************** +Prints a row in a select result. */ + +que_thr_t* +row_printf_step( +/*============*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = thr->run_node; + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + printf(" ::: "); + + arg = que_node_get_next(arg); + } + + printf("\n"); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/******************************************************************** +Converts a key value stored in MySQL format to an Innobase dtuple. +The last field of the key value may be just a prefix of a fixed length +field: hence the parameter key_len. */ + +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /* in: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! 
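+				(In the MySQL key format each nullable
+				column is preceded by one byte which is
+				nonzero if the value is SQL NULL; the
+				value bytes follow, and the last column
+				may be present only as a prefix of the
+				full column value: see key_len below.)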
*/ + byte* buf, /* in: buffer to use in field + conversions */ + dict_index_t* index, /* in: index of the key value */ + byte* key_ptr, /* in: MySQL key value */ + ulint key_len) /* in: MySQL key value length */ +{ + dfield_t* dfield; + ulint offset; + ulint len; + byte* key_end; + ulint n_fields = 0; + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + + if (dfield_get_type(dfield)->mtype == DATA_SYS) { + /* A special case: we are looking for a position in a + generated clustered index: the first and the only + ordering column is ROW_ID */ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + offset = 0; + len = dfield_get_type(dfield)->len; + + n_fields++; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + offset = 1; + + if (*key_ptr != 0) { + dfield_set_data(dfield, NULL, UNIV_SQL_NULL); + + goto next_part; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, buf, key_ptr + offset, len, + dfield_get_type(dfield)->mtype, + dfield_get_type(dfield)->prtype + & DATA_UNSIGNED); + next_part: + key_ptr += (offset + len); + + if (key_ptr > key_end) { + /* The last field in key was not a complete + field but a prefix of it */ + + ut_ad(dfield_get_len(dfield) != UNIV_SQL_NULL); + + dfield_set_data(dfield, buf, + len - (ulint)(key_ptr - key_end)); + } + + buf += len; + + dfield++; + } + + /* We set the length of tuple to n_fields: we assume that + the memory area allocated for it is big enough (usually + bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/****************************************************************** +Stores the row id to the prebuilt struct. */ +UNIV_INLINE +void +row_sel_store_row_id_to_prebuilt( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt */ + rec_t* index_rec, /* in: record */ + dict_index_t* index) /* in: index of the record */ +{ + byte* data; + ulint len; + + data = rec_get_nth_field(index_rec, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); + + ut_a(len == DATA_ROW_ID_LEN); + + ut_memcpy(prebuilt->row_id, data, len); +} + +/****************************************************************** +Stores a non-SQL-NULL field in the MySQL format. 
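+
+For example, a 4-byte signed integer is stored by InnoDB in big-endian
+byte order with the sign bit inverted, so that plain memcmp gives the
+correct ordering; for MySQL the bytes are reversed to little-endian
+order and the sign bit is restored. A minimal sketch of just that case
+(the helper name is ours, an illustration only, not part of this
+module's interface): */
+
+#if 0	/* illustrative sketch, not used by the code */
+static
+void
+example_int_to_mysql_format(
+/*========================*/
+	byte*	dest,	/* out: 4 bytes, little-endian, sign restored */
+	byte*	data)	/* in: 4 bytes in the InnoDB integer format */
+{
+	ulint	i;
+
+	for (i = 0; i < 4; i++) {
+		dest[i] = data[3 - i];	/* reverse the byte order */
+	}
+
+	dest[3] = dest[3] ^ 128;	/* restore the sign bit of the
+					most significant byte */
+}
+#endif
+
+/* The function itself follows.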
*/ +UNIV_INLINE +void +row_sel_field_store_in_mysql_format( +/*================================*/ + byte* dest, /* in/out: buffer where to store; NOTE that BLOBs + are not in themselves stored here: the caller must + allocate and copy the BLOB into buffer before, and pass + the pointer to the BLOB in 'data' */ + ulint col_len,/* in: MySQL column length */ + byte* data, /* in: data to store */ + ulint len, /* in: length of the data */ + ulint type, /* in: data type */ + ulint is_unsigned)/* in: != 0 if an unsigned integer type */ +{ + byte* ptr; + + ut_ad(len != UNIV_SQL_NULL); + + if (type == DATA_INT) { + /* Convert integer data from Innobase to a little-endian + format, sign bit restored to normal */ + + ptr = dest + len; + + for (;;) { + ptr--; + *ptr = *data; + if (ptr == dest) { + break; + } + data++; + } + + if (!is_unsigned) { + dest[len - 1] = dest[len - 1] ^ 128; + } + + ut_ad(col_len == len); + } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL + || type == DATA_BINARY) { + /* Store the length of the data to the first two bytes of + dest; does not do anything yet because MySQL has + no real vars! */ + + dest = row_mysql_store_var_len(dest, len); + ut_memcpy(dest, data, len); + + /* ut_ad(col_len >= len + 2); No real var implemented in + MySQL yet! */ + + } else if (type == DATA_BLOB) { + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, col_len, data, len); + } else { + ut_memcpy(dest, data, len); + ut_ad(col_len == len); + } +} + +/****************************************************************** +Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. 
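+
+The SQL NULL information is returned in a null bit map at the start of
+mysql_rec: the map is first reset, and for each column whose value is
+SQL NULL the corresponding bit is set, as in
+
+	mysql_rec[templ->mysql_null_byte_offset]
+				|= (byte) templ->mysql_null_bit_mask;
+
+where templ is the column's entry in the prebuilt template.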
*/ +static +void +row_sel_store_mysql_rec( +/*====================*/ + byte* mysql_rec, /* out: row in the MySQL format */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec) /* in: Innobase record in the index + which was described in prebuilt's + template */ +{ + mysql_row_templ_t* templ; + byte* data; + ulint len; + byte* blob_buf; + ulint i; + + ut_ad(prebuilt->mysql_template); + + if (prebuilt->blob_heap != NULL) { + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + } + + /* Mark all columns as not SQL NULL */ + + memset(mysql_rec, '\0', prebuilt->null_bitmap_len); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + data = rec_get_nth_field(rec, templ->rec_field_no, &len); + + if (len != UNIV_SQL_NULL) { + if (templ->type == DATA_BLOB) { + + /* Copy the BLOB data to the BLOB + heap of prebuilt */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = + mem_heap_create(len); + } + + blob_buf = mem_heap_alloc(prebuilt->blob_heap, + len); + ut_memcpy(blob_buf, data, len); + + data = blob_buf; + } + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, data, len, + templ->type, templ->is_unsigned); + } else { + mysql_rec[templ->mysql_null_byte_offset] |= + (byte) (templ->mysql_null_bit_mask); + } + } +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + dict_index_t* clust_index, /* in: clustered index */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec, /* in: record in a clustered index */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_consistent_read(rec, mtr, clust_index, + read_view, prebuilt->old_vers_heap, + old_vers); + return(err); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. 
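+
+The lookup works by building a row reference from the secondary index
+record: the reference consists of the clustered index key fields which
+are stored in every secondary index entry. A persistent cursor is then
+positioned on the clustered index with that reference, using the
+search mode PAGE_CUR_LE.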
*/ +static +ulint +row_sel_get_clust_rec_for_mysql( +/*============================*/ + /* out: DB_SUCCESS or error code */ + row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */ + dict_index_t* sec_index,/* in: secondary index where rec resides */ + rec_t* rec, /* in: record in a non-clustered index */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + trx_t* trx; + + *out_rec = NULL; + + row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec); + + clust_index = dict_table_get_first_index(sec_index->table); + + btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + ut_ad(page_rec_is_user_rec(clust_rec)); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record */ + + err = lock_clust_rec_read_check_and_lock(0, clust_rec, + clust_index, + prebuilt->select_lock_type, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + trx = thr_get_trx(thr); + + if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index, + trx->read_view)) { + + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, clust_rec, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + return(err); + } + + clust_rec = old_vers; + } + } + + *out_rec = clust_rec; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We may use the cursor in update: store its position */ + + btr_pcur_store_position(prebuilt->clust_pcur, mtr); + } + + return(DB_SUCCESS); +} + +/************************************************************************ +Restores cursor position after it has been stored. We have to take into +account that the record cursor was positioned on can have been deleted. +Then we may have to move the cursor one step up or down. */ +static +ibool +sel_restore_position_for_mysql( +/*===========================*/ + /* out: TRUE if we may need to + process the record the cursor is + now positioned on (i.e. we should + not go to the next record yet) */ + ulint latch_mode, /* in: latch mode wished in + restoration */ + btr_pcur_t* pcur, /* in: cursor whose position + has been stored */ + ibool moves_up, /* in: TRUE if the cursor moves up + in the index */ + mtr_t* mtr) /* in: mtr; CAUTION: may commit + mtr temporarily! 
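+					*/
+/*
+The outcome depends on the stored relative position pcur->rel_pos: if
+the record on which the cursor was positioned still exists, the cursor
+is restored on it and FALSE is returned, and the caller may then step
+onwards; if the record was meanwhile purged, the cursor lands on a
+neighboring record and TRUE is returned, so that the caller does not
+skip a record which has not yet been processed.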
*/ +{ + ibool success; + ulint relative_position; + + relative_position = pcur->rel_pos; + + success = btr_pcur_restore_position(latch_mode, pcur, mtr); + + if (relative_position == BTR_PCUR_ON) { + if (success) { + return(FALSE); + } + + if (moves_up) { + btr_pcur_move_to_next(pcur, mtr); + + return(TRUE); + } + + return(TRUE); + } + + if (relative_position == BTR_PCUR_AFTER) { + if (moves_up) { + return(TRUE); + } + + if (btr_pcur_is_on_user_rec(pcur, mtr)) { + btr_pcur_move_to_prev(pcur, mtr); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_BEFORE); + + if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) { + btr_pcur_move_to_next(pcur, mtr); + } + + return(TRUE); +} + +/************************************************************************ +Pops a cached row for MySQL from the fetch cache. */ +UNIV_INLINE +void +row_sel_pop_cached_row_for_mysql( +/*=============================*/ + byte* buf, /* in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct */ +{ + ut_ad(prebuilt->n_fetch_cached > 0); + + ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_row_len); + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/************************************************************************ +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_push_cache_row_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec) /* in: record to push */ +{ + ulint i; + + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + prebuilt->fetch_cache[i] = mem_alloc( + prebuilt->mysql_row_len); + } + } + + ut_ad(prebuilt->fetch_cache_first == 0); + + row_sel_store_mysql_rec( + prebuilt->fetch_cache[prebuilt->n_fetch_cached], + prebuilt, rec); + + prebuilt->n_fetch_cached++; +} + +/************************************************************************ +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! */ + +ulint +row_search_for_mysql( +/*=================*/ + /* out: DB_SUCCESS, + DB_RECORD_NOT_FOUND, + DB_END_OF_INDEX, or DB_DEADLOCK */ + byte* buf, /* in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /* in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /* in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /* in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. 
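+
+A sketch of the calling pattern for a forward range scan (the variable
+names are ours, an illustration only): the cursor is first opened with
+direction == 0, after which rows are fetched with ROW_SEL_NEXT: */
+
+#if 0	/* illustrative sketch, not used by the code */
+	ulint	err;
+
+	err = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt, 0, 0);
+
+	while (err == DB_SUCCESS) {
+		/* ... use the row which is now in buf ... */
+
+		err = row_search_for_mysql(buf, 0, prebuilt, 0,
+							ROW_SEL_NEXT);
+	}
+
+	/* err ends up as DB_END_OF_INDEX when the scan reaches the
+	end of the index */
+#endif
+
+/* The function body follows.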
*/ +{ + dict_index_t* index = prebuilt->index; + dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + rec_t* rec; + rec_t* index_rec; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + ibool moved; + ibool cons_read_requires_clust_rec; + ibool was_lock_wait; + ulint ret; + ibool unique_search_from_clust_index = FALSE; + ibool mtr_has_extra_clust_latch = FALSE; + ibool moves_up = FALSE; + mtr_t mtr; + + ut_ad(index && pcur && search_tuple); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + ut_ad(sync_thread_levels_empty_gen(FALSE)); + + if (direction == 0) { + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (direction != prebuilt->fetch_direction) { + if (prebuilt->n_fetch_cached > 0) { + ut_a(0); + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! */ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + + return(DB_SUCCESS); + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { + + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + + return(DB_RECORD_NOT_FOUND); + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + if (match_mode == ROW_SEL_EXACT && index->type & DICT_UNIQUE + && index->type & DICT_CLUSTERED + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index)) { + + if (direction == ROW_SEL_NEXT) { + /* MySQL sometimes seems to do fetch next even + if the search condition is unique; we do not store + pcur position in this case, so we cannot + restore cursor position, and must return + immediately */ + + return(DB_RECORD_NOT_FOUND); + } + + ut_a(direction == 0); /* We cannot do fetch prev, as we have + not stored the cursor position */ + mode = PAGE_CUR_GE; + + unique_search_from_clust_index = TRUE; + } + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (direction == 0) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) { + moves_up = TRUE; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = TRUE; + } + + mtr_start(&mtr); + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + clust_index = dict_table_get_first_index(index->table); + + if (direction != 0) { + moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + if (!moved) { + goto next_rec; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + + btr_pcur_open_with_no_init(index, search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, 0, &mtr); + } else { + if (mode == PAGE_CUR_G) { + btr_pcur_open_at_index_side(TRUE, index, + BTR_SEARCH_LEAF, pcur, FALSE, &mtr); + } 
else if (mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side(FALSE, index, + BTR_SEARCH_LEAF, pcur, FALSE, &mtr); + } + } + + if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + + } else if (prebuilt->select_lock_type == LOCK_NONE) { + /* This is a consistent read */ + trx_start_if_not_started(trx); + + /* Assign a read view for the query */ + + trx_assign_read_view(trx); + prebuilt->sql_stat_start = FALSE; + } else { + trx_start_if_not_started(trx); + + if (prebuilt->select_lock_type == LOCK_S) { + err = lock_table(0, index->table, LOCK_IS, thr); + } else { + err = lock_table(0, index->table, LOCK_IX, thr); + } + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + prebuilt->sql_stat_start = FALSE; + } + + /*-------------------------------------------------------------*/ +rec_loop: + cons_read_requires_clust_rec = FALSE; + + rec = btr_pcur_get_rec(pcur); + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type, + thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (unique_search_from_clust_index && btr_pcur_get_up_match(pcur) + == dtuple_get_n_fields(search_tuple)) { + /* The record matches enough */ + + ut_ad(mode == PAGE_CUR_GE); + + } else if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record matches completely to search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + if (0 != cmp_dtuple_rec(search_tuple, rec)) { + + btr_pcur_store_position(pcur, &mtr); + + ret = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + + btr_pcur_store_position(pcur, &mtr); + + ret = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* Get the right version of the row in a consistent read */ + + if (prebuilt->select_lock_type == LOCK_NONE) { + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + cons_read_requires_clust_rec = FALSE; + + if (index == clust_index) { + + if (!lock_clust_rec_cons_read_sees(rec, index, + trx->read_view)) { + + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, rec, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, + trx->read_view)) { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. 
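+			A secondary index record carries no roll
+			pointer of its own: only the clustered index
+			record has one, and only through it can we
+			follow the undo log chain to an old version of
+			the row.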
*/ + + cons_read_requires_clust_rec = TRUE; + } + } + + if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + goto next_rec; + } + + /* Get the clustered index record if needed and if we did + not do the search using the clustered index */ + + index_rec = rec; + + if (index != clust_index && (cons_read_requires_clust_rec + || prebuilt->need_to_access_clustered)) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_has_extra_clust_latch = TRUE; + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec)) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + rec = clust_rec; + } + + /* We found a qualifying row */ + + if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD + && !prebuilt->templ_contains_blob + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->clust_index_was_generated) { + + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE */ + + row_sel_push_cache_row_for_mysql(prebuilt, rec); + + if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { + + goto got_row; + } + + goto next_rec; + } else { + row_sel_store_mysql_rec(buf, prebuilt, rec); + + if (prebuilt->clust_index_was_generated) { + row_sel_store_row_id_to_prebuilt(prebuilt, index_rec, + index); + } + } +got_row: + /* TODO: should we in every case store the cursor position, even + if this is just a join, for example? */ + + if (!unique_search_from_clust_index + || prebuilt->select_lock_type == LOCK_X) { + + /* Inside an update always store the cursor position */ + + btr_pcur_store_position(pcur, &mtr); + } + + ret = DB_SUCCESS; + + goto normal_return; + /*-------------------------------------------------------------*/ +next_rec: + if (mtr_has_extra_clust_latch) { + /* We must commit mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
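+		Here we store the position of pcur and commit &mtr;
+		the cursor is then restored under a fresh
+		mini-transaction, and if the restore reports that the
+		record may not have been processed yet, we loop back to
+		rec_loop instead of stepping the cursor.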
*/ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + mtr_start(&mtr); + moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + if (moved) { + goto rec_loop; + } + } + + if (moves_up) { + moved = btr_pcur_move_to_next(pcur, &mtr); + } else { + moved = btr_pcur_move_to_prev(pcur, &mtr); + } + + if (!moved) { + btr_pcur_store_position(pcur, &mtr); + + if (match_mode != 0) { + ret = DB_RECORD_NOT_FOUND; + } else { + ret = DB_END_OF_INDEX; + } + + goto normal_return; + } + + goto rec_loop; + /*-------------------------------------------------------------*/ +lock_wait_or_error: + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + trx->error_state = err; + + /* The following is a patch for MySQL */ + + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + mtr_start(&mtr); + + sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + mode = pcur->search_mode; + + goto rec_loop; + } + + return(err); + +normal_return: + que_thr_stop_for_mysql_no_error(thr, trx); + + mtr_commit(&mtr); + + if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + ret = DB_SUCCESS; + } + + return(ret); +} diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c new file mode 100644 index 00000000000..68115895dbb --- /dev/null +++ b/innobase/row/row0uins.c @@ -0,0 +1,308 @@ +/****************************************************** +Fresh insert undo + +(c) 1996 Innobase Oy + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" + +#ifdef UNIV_NONINL +#include "row0uins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" + +/******************************************************************* +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. 
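+
+The removal is first tried as an optimistic delete within the leaf
+page; if that fails, a pessimistic delete, which may change the tree
+structure, is attempted, and is retried up to
+BTR_CUR_RETRY_DELETE_N_TIMES times, sleeping BTR_CUR_RETRY_SLEEP_TIME
+between attempts, since the pessimistic delete can fail when file
+space is nearly exhausted.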
*/ +static +ulint +row_undo_ins_remove_clust_rec( +/*==========================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: undo node */ + que_thr_t* thr) /* in: query thread */ +{ + btr_cur_t* btr_cur; + ibool success; + ulint err; + ulint n_tries = 0; + mtr_t mtr; + + UT_NOT_USED(thr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur), + &mtr); + ut_a(success); + + if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + + /* Drop the index tree associated with the row in + SYS_INDEXES table: */ + + dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr); + + mtr_commit(&mtr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, + &(node->pcur), &mtr); + ut_a(success); + } + + btr_cur = btr_pcur_get_btr_cur(&(node->pcur)); + + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (success) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_TREE, + &(node->pcur), &mtr); + ut_a(success); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + trx_undo_rec_release(node->trx, node->undo_no); + + return(err); +} + +/******************************************************************* +Removes a secondary index entry if found. */ +static +ulint +row_undo_ins_remove_sec_low( +/*========================*/ + /* out: DB_SUCCESS, DB_FAIL, or + DB_OUT_OF_FILE_SPACE */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to remove */ + que_thr_t* thr) /* in: query thread */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool found; + ibool success; + ulint err; + mtr_t mtr; + + UT_NOT_USED(thr); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* Not found */ + + /* FIXME: remove printfs in the final version */ + + /* printf( + "--UNDO INS: Record not found from page %lu index %s\n", + buf_frame_get_page_no(btr_cur_get_rec(btr_cur)), + index->name); */ + + /* ibuf_print(); */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/******************************************************************* +Removes a secondary index entry from the index if found. Tries first +optimistic, then pessimistic descent down the tree. 
*/ +static +ulint +row_undo_ins_remove_sec( +/*====================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + ulint n_tries = 0; + + /* Try first optimistic descent to the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ +retry: + err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry, thr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/*************************************************************** +Parses the row reference and other info in a fresh insert undo record. */ +static +void +row_undo_ins_parse_undo_rec( +/*========================*/ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + ulint type; + ulint dummy; + + ut_ad(node && thr); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, &undo_no, + &table_id); + ut_ad(type == TRX_UNDO_INSERT_REC); + node->rec_type = type; + + /* NOTE that the table has to be explicitly released later */ + node->table = dict_table_get_on_id(table_id, node->trx); + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); +} + +/*************************************************************** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. 
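+
+The entries are removed from the secondary indexes first and the
+clustered index record last, i.e., in the reverse of the order in
+which an insert creates them, so that a secondary index entry never
+points to a clustered index record which is already gone.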
*/ + +ulint +row_undo_ins( +/*=========*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dtuple_t* entry; + ibool found; + ulint err; + + ut_ad(node && thr); + ut_ad(node->state == UNDO_NODE_INSERT); + + row_undo_ins_parse_undo_rec(node, thr); + + found = row_undo_search_clust_to_pcur(node, thr); + + if (!found) { + return(DB_SUCCESS); + } + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + while (node->index != NULL) { + entry = row_build_index_entry(node->row, node->index, + node->heap); + err = row_undo_ins_remove_sec(node->index, entry, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + err = row_undo_ins_remove_clust_rec(node, thr); + + return(err); +} diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c new file mode 100644 index 00000000000..2aa223a6186 --- /dev/null +++ b/innobase/row/row0umod.c @@ -0,0 +1,608 @@ +/****************************************************** +Undo modify of a row + +(c) 1997 Innobase Oy + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" + +#ifdef UNIV_NONINL +#include "row0umod.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. +(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/*************************************************************** +Checks if also the previous version of the clustered index record was +modified or inserted by the same transaction, and its undo number is such +that it should be undone in the same rollback. 
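+
+Concretely: the previous version must be undone in this same rollback
+if it was written by this same transaction and its undo number does
+not fall below trx->roll_limit, the limit down to which this
+(possibly partial) rollback proceeds.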
*/ +UNIV_INLINE +ibool +row_undo_mod_undo_also_prev_vers( +/*=============================*/ + /* out: TRUE if also previous modify or + insert of this row should be undone */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dulint* undo_no)/* out: the undo number */ +{ + trx_undo_rec_t* undo_rec; + ibool ret; + trx_t* trx; + + UT_NOT_USED(thr); + + trx = node->trx; + + if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) { + + return(FALSE); + } + + undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap); + + *undo_no = trx_undo_rec_get_undo_no(undo_rec); + + if (ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0) { + ret = TRUE; + } else { + ret = FALSE; + } + + return(ret); +} + +/*************************************************************** +Undoes a modify in a clustered index record. */ +static +ulint +row_undo_mod_clust_low( +/*===================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code: + we may run out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr, /* in: mtr */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + ibool do_remove; + + index = dict_table_get_first_index(node->table); + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + ut_ad(success); + + /* Find out if we can remove the whole clustered index record */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { + + do_remove = TRUE; + } else { + do_remove = FALSE; + } + + if (mode == BTR_MODIFY_LEAF) { + + if (do_remove) { + success = btr_cur_optimistic_delete(btr_cur, mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + if (do_remove) { + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } else { + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } + } + + return(err); +} + +/*************************************************************** +Undoes a modify in a clustered index record. Sets also the node state for the +next round of undo. 
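+
+As elsewhere in this module, the change is first tried with
+BTR_MODIFY_LEAF, keeping all modifications within the leaf page, and
+only if that fails is it redone with BTR_MODIFY_TREE, under which the
+B-tree structure itself may be changed.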
*/ +static +ulint +row_undo_mod_clust( +/*===============*/ + /* out: DB_SUCCESS or error code: we may run + out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + ulint err; + ibool success; + ibool more_vers; + dulint new_undo_no; + + ut_ad(node && thr); + + /* Check if also the previous version of the clustered index record + should be undone in this same rollback operation */ + + more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no); + + pcur = &(node->pcur); + + mtr_start(&mtr); + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + } + + node->state = UNDO_NODE_FETCH_NEXT; + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + trx_undo_rec_release(node->trx, node->undo_no); + + if (more_vers && err == DB_SUCCESS) { + + /* Reserve the undo log record to the prior version after + committing &mtr: this is necessary to comply with the latching + order, as &mtr may contain the fsp latch which is lower in + the latch hierarchy than trx->undo_mutex. */ + + success = trx_undo_rec_reserve(node->trx, new_undo_no); + + if (success) { + node->state = UNDO_NODE_PREV_VERS; + } + } + + return(err); +} + +/*************************************************************** +Delete marks or removes a secondary index entry if found. */ +static +ulint +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + /* out: DB_SUCCESS, DB_FAIL, or + DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode) /* in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + ibool found; + mtr_t mtr; + mtr_t mtr_vers; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has; + ulint err; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* Not found */ + + /* FIXME: remove printfs in the final version */ + + /* printf( + "--UNDO MOD: Record not found from page %lu index %s\n", + buf_frame_get_page_no(btr_cur_get_rec(btr_cur)), + index->name); */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should delete mark the record. 
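+	The check is made below with row_vers_old_has_index_entry: if
+	some version of the row which may still be needed by a
+	consistent read would produce this same secondary index entry,
+	the entry is kept and only delete marked; otherwise it can be
+	removed for good.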
*/ + + mtr_start(&mtr_vers); + + success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur), + &mtr_vers); + ut_ad(success); + + old_has = row_vers_old_has_index_entry(FALSE, + btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + if (old_has) { + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, &mtr); + ut_ad(err == DB_SUCCESS); + } else { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/*************************************************************** +Delete marks or removes a secondary index entry if found. */ +UNIV_INLINE +ulint +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + ulint err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_TREE); + return(err); +} + +/*************************************************************** +Delete unmarks a secondary index entry which must be found. */ +static +void +row_undo_mod_del_unmark_sec( +/*========================*/ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + mtr_t mtr; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ulint err; + ibool found; + + UT_NOT_USED(node); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, + &mtr); + ut_a(found); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); + ut_ad(err == DB_SUCCESS); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. */ +static +ulint +row_undo_mod_upd_del_sec( +/*=====================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, index, heap); + + err = row_undo_mod_del_mark_or_remove_sec(node, thr, index, + entry); + if (err != DB_SUCCESS) { + + mem_heap_free(heap); + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. 
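+
+When a delete mark is undone, every secondary index entry of the row
+must still exist, so the entries are only delete unmarked, never
+inserted or removed: see row_undo_mod_del_unmark_sec above.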
*/ +static +ulint +row_undo_mod_del_mark_sec( +/*======================*/ + /* out: DB_SUCCESS */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, index, heap); + + row_undo_mod_del_unmark_sec(node, thr, index, entry); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. */ +static +ulint +row_undo_mod_upd_exist_sec( +/*=======================*/ + /* out: DB_SUCCESS or error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + /* No change in secondary indexes */ + + return(DB_SUCCESS); + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field(node->row, node->index, + node->update)) { + + /* Build the newest version of the index entry */ + entry = row_build_index_entry(node->row, index, heap); + + err = row_undo_mod_del_mark_or_remove_sec(node, thr, + index, entry); + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row */ + + row_upd_index_replace_new_col_vals(entry, index, + node->update); + + row_undo_mod_del_unmark_sec(node, thr, index, entry); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Parses the row reference and other info in a modify undo log record. */ +static +void +row_undo_mod_parse_undo_rec( +/*========================*/ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + + ut_ad(node && thr); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + &undo_no, &table_id); + node->rec_type = type; + + /* NOTE that the table has to be explicitly released later */ + node->table = dict_table_get_on_id(table_id, thr_get_trx(thr)); + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, node->heap, + &(node->update)); + node->new_roll_ptr = roll_ptr; + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; +} + +/*************************************************************** +Undoes a modify operation on a row of a table. 
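+
+The work is dispatched on the undo record type: TRX_UNDO_UPD_EXIST_REC
+undoes an update of a delete unmarked record, TRX_UNDO_DEL_MARK_REC
+undoes a delete marking, and TRX_UNDO_UPD_DEL_REC undoes an update of
+a delete marked record; in each case the secondary indexes are handled
+first and the clustered index record last.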
*/
+
+ulint
+row_undo_mod(
+/*=========*/
+				/* out: DB_SUCCESS or error code */
+	undo_node_t*	node,	/* in: row undo node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ibool	found;
+	ulint	err;
+
+	ut_ad(node && thr);
+	ut_ad(node->state == UNDO_NODE_MODIFY);
+
+	row_undo_mod_parse_undo_rec(node, thr);
+
+	found = row_undo_search_clust_to_pcur(node, thr);
+
+	if (!found) {
+		/* It is already undone, or will be undone by another query
+		thread */
+
+		node->state = UNDO_NODE_FETCH_NEXT;
+
+		return(DB_SUCCESS);
+	}
+
+	node->index = dict_table_get_next_index(
+				dict_table_get_first_index(node->table));
+
+	if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+		err = row_undo_mod_upd_exist_sec(node, thr);
+
+	} else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+		err = row_undo_mod_del_mark_sec(node, thr);
+	} else {
+		ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+		err = row_undo_mod_upd_del_sec(node, thr);
+	}
+
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	err = row_undo_mod_clust(node, thr);
+
+	return(err);
+} diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c new file mode 100644 index 00000000000..6dc032f7e13 --- /dev/null +++ b/innobase/row/row0undo.c @@ -0,0 +1,313 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in
+the clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was
+not part of the ordering fields in the clustered index? Maybe we would have
+to write it to the undo log. Well, maybe not, because if we order the row id
+and trx id in descending order, then the only undeleted copy is the first in
+the index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr +in node pointers of B-tree. +(2) Fix in secondary indexes: include all fields in node pointers, and +if an entry is inserted, check if it is equal to the right neighbor, +in which case update the right neighbor: the neighbor must be delete +marked, set it unmarked and write the trx id of the current transaction. + +What if the same trx repeatedly updates the same row, updating a secondary +index field or not? Updating a clustered index ordering field? + +(1) If it does not update the secondary index and not the clustered index +ord field. Then the secondary index record stays unchanged, but the +trx id in the secondary index record may be smaller than in the clustered +index record. This is no problem? +(2) If it updates secondary index ord field but not clustered: then in +secondary index there are delete marked records, which differ in an +ord field. No problem. +(3) Updates clustered ord field but not secondary, and secondary index +is unique. Then the record in secondary index is just updated at the +clustered ord field. +(4) + +Problem with duplicate records: +Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a +bigger trx id has inserted and delete marked a similar row, our trx inserts +again a similar row, and a trx with an even bigger id delete marks it. Then +the position of the row should change in the index if the trx id affects +the alphabetical ordering. + +Fix 2: If an insert encounters a similar row marked deleted, we turn the +insert into an 'update' of the row marked deleted. Then we must write undo +info on the update. A problem: what if a purge operation tries to remove +the delete marked row? + +We can think of the database row versions as a linked list which starts +from the record in the clustered index, and is linked by roll ptrs +through undo logs. The secondary index records are references which tell +what kinds of records can be found in this linked list for a record +in the clustered index. + +How to do the purge? A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/************************************************************************ +Creates a row undo node to a query graph. */ + +undo_node_t* +row_undo_node_create( +/*=================*/ + /* out, own: undo node */ + trx_t* trx, /* in: transaction */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx && parent && heap); + + undo = mem_heap_alloc(heap, sizeof(undo_node_t)); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->state = UNDO_NODE_FETCH_NEXT; + undo->trx = trx; + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/*************************************************************** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. 
*/ + +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + /* out: TRUE if found; NOTE the node->pcur + must be closed by the caller, regardless of + the return value */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + ibool found; + mtr_t mtr; + ibool ret; + rec_t* rec; + + UT_NOT_USED(thr); + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + rec = btr_pcur_get_rec(&(node->pcur)); + + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, + row_get_rec_roll_ptr(rec, clust_index))) { + + /* We must remove the reservation on the undo log record + BEFORE releasing the latch on the clustered index page: this + is to make sure that some thread will eventually undo the + modification corresponding to node->roll_ptr. */ + + /* printf("--------------------undoing a previous version\n"); + */ + trx_undo_rec_release(node->trx, node->undo_no); + + ret = FALSE; + } else { + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + node->heap); + btr_pcur_store_position(&(node->pcur), &mtr); + + ret = TRUE; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + return(ret); +} + +/*************************************************************** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. */ +static +ulint +row_undo( +/*=====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + trx_t* trx; + dulint roll_ptr; + + ut_ad(node && thr); + + trx = node->trx; + + if (node->state == UNDO_NODE_FETCH_NEXT) { + + /* The call below also starts &mtr */ + node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, + trx->roll_limit, + &roll_ptr, + node->heap); + if (!node->undo_rec) { + /* Rollback completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + + } else if (node->state == UNDO_NODE_PREV_VERS) { + + /* Undo should be done to the same clustered index record + again in this same rollback, restoring the previous version */ + + roll_ptr = node->new_roll_ptr; + + node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr, + node->heap); + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + } + + if (node->state == UNDO_NODE_INSERT) { + + err = row_undo_ins(node, thr); + + node->state = UNDO_NODE_FETCH_NEXT; + } else { + ut_ad(node->state == UNDO_NODE_MODIFY); + err = row_undo_mod(node, thr); + } + + /* Do some cleanup */ + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/*************************************************************** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. 
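+The actual work is done in row_undo above, which implements a small state
+machine; a sketch of the states, using the names in the code:
+
+	UNDO_NODE_FETCH_NEXT	pop the next undo log record of the trx;
+				if none is left, return control to the
+				parent query thread node
+	UNDO_NODE_PREV_VERS	fetch the undo log record of the previous
+				version of the same clustered index record
+	UNDO_NODE_INSERT	roll back an insert (row_undo_ins)
+	UNDO_NODE_MODIFY	roll back a delete mark or update
+				(row_undo_mod)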
*/ + +que_thr_t* +row_undo_step( +/*==========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + undo_node_t* node; + trx_t* trx; + + ut_ad(thr); + + srv_activity_count++; + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + err = row_undo(node, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* SQL error detected */ + + ut_a(0); + + return(NULL); + } + + return(thr); +} diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c new file mode 100644 index 00000000000..44843494247 --- /dev/null +++ b/innobase/row/row0upd.c @@ -0,0 +1,1394 @@ +/****************************************************** +Update of a row + +(c) 1996 Innobase Oy + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "row0upd.h" + +#ifdef UNIV_NONINL +#include "row0upd.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "mach0data.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0sel.h" +#include "row0row.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0sym.h" +#include "eval0eval.h" + + +/* What kind of latch and lock can we assume when the control comes to + ------------------------------------------------------------------- +an update node? +-------------- +Efficiency of massive updates would require keeping an x-latch on a +clustered index page through many updates, and not setting an explicit +x-lock on clustered index records, as they anyway will get an implicit +x-lock when they are updated. A problem is that the read nodes in the +graph should know that they must keep the latch when passing the control +up to the update node, and not set any record lock on the record which +will be updated. Another problem occurs if the execution is stopped, +as the kernel switches to another query thread, or the transaction must +wait for a lock. Then we should be able to release the latch and, maybe, +acquire an explicit x-lock on the record. + Because this seems too complicated, we conclude that the less +efficient solution of releasing all the latches when the control is +transferred to another node, and acquiring explicit x-locks, is better. */ + +/* How is a delete performed? If there is a delete without an +explicit cursor, i.e., a searched delete, there are at least +two different situations: +the implicit select cursor may run on (1) the clustered index or +on (2) a secondary index. The delete is performed by setting +the delete bit in the record and substituting the id of the +deleting transaction for the original trx id, and substituting a +new roll ptr for previous roll ptr. The old trx id and roll ptr +are saved in the undo log record. Thus, no physical changes occur +in the index tree structure at the time of the delete. Only +when the undo log is purged, the index records will be physically +deleted from the index trees. + +The query graph executing a searched delete would consist of +a delete node which has as a subtree a select subgraph. +The select subgraph should return a (persistent) cursor +in the clustered index, placed on page which is x-latched. +The delete node should look for all secondary index records for +this clustered index entry and mark them as deleted. When is +the x-latch freed? 
The most efficient way for performing a +searched delete is obviously to keep the x-latch for several +steps of query graph execution. */ + +/************************************************************************* +Creates an update node for a query graph. */ + +upd_node_t* +upd_node_create( +/*============*/ + /* out, own: update node */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + upd_node_t* node; + + node = mem_heap_alloc(heap, sizeof(upd_node_t)); + node->common.type = QUE_NODE_UPDATE; + + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->select_will_do_update = FALSE; + node->in_mysql_interface = FALSE; + + node->row = NULL; + node->index = NULL; + + node->select = NULL; + + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + node->cmpl_info = 0; + + return(node); +} + +/************************************************************************* +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ + +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /* in: record */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ +{ + byte* field; + ulint len; + + field = rec_get_nth_field(rec, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + trx_write_trx_id(field, trx_id); + + field = rec_get_nth_field(rec, pos + 1, &len); + ut_ad(len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(field, roll_ptr); +} + +/************************************************************************* +Sets the trx id or roll ptr field of a clustered index entry. */ + +void +row_upd_index_entry_sys_field( +/*==========================*/ + dtuple_t* entry, /* in: index entry, where the memory buffers + for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /* in: clustered index */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + dulint val) /* in: value to write */ +{ + dfield_t* dfield; + byte* field; + ulint pos; + + ut_ad(index->type & DICT_CLUSTERED); + + pos = dict_index_get_sys_col_pos(index, type); + + dfield = dtuple_get_nth_field(entry, pos); + field = dfield_get_data(dfield); + + if (type == DATA_TRX_ID) { + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + trx_write_roll_ptr(field, val); + } +} + +/*************************************************************** +Returns TRUE if row update changes size of some field in index. 
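+An SQL null value is counted with the storage size reserved for a null of
+the column type (dtype_get_sql_null_size), so that setting a field to NULL,
+or replacing a NULL, is reported as a size change whenever the stored
+lengths differ.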
*/ + +ibool +row_upd_changes_field_size( +/*=======================*/ + /* out: TRUE if the update changes the size of + some field in index */ + rec_t* rec, /* in: record in clustered index */ + dict_index_t* index, /* in: clustered index */ + upd_t* update) /* in: update vector */ +{ + upd_field_t* upd_field; + dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(index->type & DICT_CLUSTERED); + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + new_len = new_val->len; + + if (new_len == UNIV_SQL_NULL) { + new_len = dtype_get_sql_null_size( + dict_index_get_nth_type(index, i)); + } + + old_len = rec_get_nth_field_size(rec, upd_field->field_no); + + if (old_len != new_len) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the record +given. No field size changes are allowed. This function is used only for +a clustered index */ + +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /* in/out: record where replaced */ + upd_t* update) /* in: update vector */ +{ + upd_field_t* upd_field; + dfield_t* new_val; + ulint n_fields; + ulint i; + + rec_set_info_bits(rec, update->info_bits); + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + rec_set_nth_field(rec, upd_field->field_no, + dfield_get_data(new_val), + dfield_get_len(new_val)); + } +} + +/************************************************************************* +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. */ + +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + /* out: new pointer to mlog */ + dict_index_t* index, /* in: clustered index */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr,/* in: roll ptr of the undo log record */ + byte* log_ptr,/* pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr); + + log_ptr += mach_write_compressed(log_ptr, + dict_index_get_sys_col_pos(index, DATA_TRX_ID)); + + trx_write_roll_ptr(log_ptr, roll_ptr); + log_ptr += DATA_ROLL_PTR_LEN; + + log_ptr += mach_dulint_write_compressed(log_ptr, trx->id); + + return(log_ptr); +} + +/************************************************************************* +Parses the log data of system field values. */ + +byte* +row_upd_parse_sys_vals( +/*===================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint* pos, /* out: TRX_ID position in record */ + dulint* trx_id, /* out: trx id */ + dulint* roll_ptr)/* out: roll ptr */ +{ + ptr = mach_parse_compressed(ptr, end_ptr, pos); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + DATA_ROLL_PTR_LEN) { + + return(NULL); + } + + *roll_ptr = trx_read_roll_ptr(ptr); + ptr += DATA_ROLL_PTR_LEN; + + ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id); + + return(ptr); +} + +/*************************************************************** +Writes to the redo log the new values of the fields occurring in the index. 
*/ + +void +row_upd_index_write_log( +/*====================*/ + upd_t* update, /* in: update vector */ + byte* log_ptr,/* in: pointer to mlog buffer: must contain at least + MLOG_BUF_MARGIN bytes of free space; the buffer is + closed within this function */ + mtr_t* mtr) /* in: mtr into whose log to write */ +{ + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf_end; + ulint i; + + n_fields = upd_get_n_fields(update); + + buf_end = log_ptr + MLOG_BUF_MARGIN; + + mach_write_to_1(log_ptr, update->info_bits); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, n_fields); + + for (i = 0; i < n_fields; i++) { + + ut_ad(MLOG_BUF_MARGIN > 30); + + if (log_ptr + 30 > buf_end) { + mlog_close(mtr, log_ptr); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + + len = new_val->len; + + log_ptr += mach_write_compressed(log_ptr, upd_field->field_no); + log_ptr += mach_write_compressed(log_ptr, len); + + if (len != UNIV_SQL_NULL) { + if (log_ptr + len < buf_end) { + ut_memcpy(log_ptr, new_val->data, len); + + log_ptr += len; + } else { + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, new_val->data, len); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + } + } + + mlog_close(mtr, log_ptr); +} + +/************************************************************************* +Parses the log data written by row_upd_index_write_log. */ + +byte* +row_upd_index_parse( +/*================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + mem_heap_t* heap, /* in: memory heap where update vector is + built */ + upd_t** update_out)/* out: update vector */ +{ + upd_t* update; + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf; + ulint info_bits; + ulint i; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_bits = mach_read_from_1(ptr); + ptr++; + ptr = mach_parse_compressed(ptr, end_ptr, &n_fields); + + if (ptr == NULL) { + + return(NULL); + } + + update = upd_create(n_fields, heap); + update->info_bits = info_bits; + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + ptr = mach_parse_compressed(ptr, end_ptr, + &(upd_field->field_no)); + if (ptr == NULL) { + + return(NULL); + } + + ptr = mach_parse_compressed(ptr, end_ptr, &len); + + if (ptr == NULL) { + + return(NULL); + } + + new_val->len = len; + + if (len != UNIV_SQL_NULL) { + + if (end_ptr < ptr + len) { + + return(NULL); + } else { + buf = mem_heap_alloc(heap, len); + ut_memcpy(buf, ptr, len); + + ptr += len; + + new_val->data = buf; + } + } + } + + *update_out = update; + + return(ptr); +} + +/******************************************************************* +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. 
*/ + +upd_t* +row_upd_build_difference( +/*=====================*/ + /* out, own: update vector of differing + fields, excluding roll ptr and trx id */ + dict_index_t* index, /* in: clustered index */ + dtuple_t* entry, /* in: entry to insert */ + rec_t* rec, /* in: clustered index record */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + dfield_t* dfield; + byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint roll_ptr_pos; + ulint trx_id_pos; + ulint i; + + /* This function is used only for a clustered index */ + ut_ad(index->type & DICT_CLUSTERED); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); + trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, i, &len); + dfield = dtuple_get_nth_field(entry, i); + + if ((i != trx_id_pos) && (i != roll_ptr_pos) + && !dfield_data_is_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ + +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /* in/out: index entry where replaced */ + dict_index_t* index, /* in: index; NOTE that may also be a + non-clustered index */ + upd_t* update) /* in: update vector */ +{ + upd_field_t* upd_field; + dfield_t* dfield; + dfield_t* new_val; + ulint field_no; + dict_index_t* clust_index; + ulint i; + + ut_ad(index); + + clust_index = dict_table_get_first_index(index->table); + + dtuple_set_info_bits(entry, update->info_bits); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + field_no = dict_index_get_nth_col_pos(index, + dict_index_get_nth_col_no(clust_index, + upd_field->field_no)); + if (field_no != ULINT_UNDEFINED) { + dfield = dtuple_get_nth_field(entry, field_no); + + new_val = &(upd_field->new_val); + + dfield_set_data(dfield, new_val->data, new_val->len); + } + } +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the +clustered index entry given. */ + +void +row_upd_clust_index_replace_new_col_vals( +/*=====================================*/ + dtuple_t* entry, /* in/out: index entry where replaced */ + upd_t* update) /* in: update vector */ +{ + upd_field_t* upd_field; + dfield_t* dfield; + dfield_t* new_val; + ulint field_no; + ulint i; + + dtuple_set_info_bits(entry, update->info_bits); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + field_no = upd_field->field_no; + + dfield = dtuple_get_nth_field(entry, field_no); + + new_val = &(upd_field->new_val); + + dfield_set_data(dfield, new_val->data, new_val->len); + } +} + +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. 
*/ + +ibool +row_upd_changes_ord_field( +/*======================*/ + /* out: TRUE if update vector changes + an ordering field in the index record */ + dtuple_t* row, /* in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + dict_index_t* index, /* in: index of the record */ + upd_t* update) /* in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_field_t* ind_field; + dict_col_t* col; + ulint n_unique; + ulint n_upd_fields; + ulint col_pos; + ulint col_no; + ulint i, j; + + ut_ad(update && index); + + n_unique = dict_index_get_n_unique(index); + n_upd_fields = upd_get_n_fields(update); + + for (i = 0; i < n_unique; i++) { + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col); + col_no = dict_col_get_no(col); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && (row == NULL + || !dfield_datas_are_equal( + dtuple_get_nth_field(row, col_no), + &(upd_field->new_val)))) { + return(TRUE); + } + } + } + + return(FALSE); +} + +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. */ + +ibool +row_upd_changes_some_index_ord_field( +/*=================================*/ + /* out: TRUE if update vector may change + an ordering field in an index record */ + dict_table_t* table, /* in: table */ + upd_t* update) /* in: update vector for the row */ +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index) { + if (row_upd_changes_ord_field(NULL, index, update)) { + + return(TRUE); + } + + index = dict_table_get_next_index(index); + } + + return(FALSE); +} + +/************************************************************************* +Copies the column values from a record. */ +UNIV_INLINE +void +row_upd_copy_columns( +/*=================*/ + rec_t* rec, /* in: record in a clustered index */ + sym_node_t* column) /* in: first column in a column list, or + NULL */ +{ + byte* data; + ulint len; + + while (column) { + data = rec_get_nth_field(rec, + column->field_nos[SYM_CLUST_FIELD_NO], + &len); + eval_node_copy_and_alloc_val(column, data, len); + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Calculates the new values for fields to update. Note that row_upd_copy_columns +must have been called first. */ +UNIV_INLINE +void +row_upd_eval_new_vals( +/*==================*/ + upd_t* update) /* in: update vector */ +{ + que_node_t* exp; + upd_field_t* upd_field; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + exp = upd_field->exp; + + eval_exp(exp); + + dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp)); + } +} + +/*************************************************************** +Stores to the heap the row on which the node->pcur is positioned. 
*/
+UNIV_INLINE
+void
+row_upd_store_row(
+/*==============*/
+	upd_node_t*	node)	/* in: row update node */
+{
+	dict_index_t*	clust_index;
+
+	ut_ad((node->pcur)->latch_mode != BTR_NO_LATCHES);
+
+	if (node->row != NULL) {
+		mem_heap_empty(node->heap);
+		node->row = NULL;
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	node->row = row_build(ROW_COPY_DATA, clust_index,
+				btr_pcur_get_rec(node->pcur), node->heap);
+}
+
+/***************************************************************
+Updates a secondary index entry of a row. */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ibool		found;
+	dict_index_t*	index;
+	dtuple_t*	entry;
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	mem_heap_t*	heap;
+	rec_t*		rec;
+	ulint		err	= DB_SUCCESS;
+
+	index = node->index;
+
+	heap = mem_heap_create(1024);
+
+	/* Build old index entry */
+	entry = row_build_index_entry(node->row, index, heap);
+
+	log_free_check();
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+									&mtr);
+	ut_ad(found);
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	rec = btr_cur_get_rec(btr_cur);
+
+	/* Delete mark the old index record; it may already be delete marked
+	if we returned after a lock wait in row_ins_index_entry below: in
+	that case err keeps its initial value DB_SUCCESS */
+
+	if (!rec_get_deleted_flag(rec)) {
+		err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, thr,
+									&mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	if (node->is_delete || (err != DB_SUCCESS)) {
+
+		mem_heap_free(heap);
+
+		return(err);
+	}
+
+	/* Build a new index entry */
+	row_upd_index_replace_new_col_vals(entry, index, node->update);
+
+	/* Insert new index entry */
+	err = row_ins_index_entry(index, entry, thr);
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/***************************************************************
+Updates a secondary index record if it is changed in the row update. This
+should be quite rare in database applications. */
+UNIV_INLINE
+ulint
+row_upd_sec_step(
+/*=============*/
+				/* out: DB_SUCCESS if operation successfully
+				completed, else error code or DB_LOCK_WAIT */
+	upd_node_t*	node,	/* in: row update node */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ulint	err;
+
+	ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+	      || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+	ut_ad(!(node->index->type & DICT_CLUSTERED));
+
+	if ((node->state == UPD_NODE_UPDATE_ALL_SEC)
+	    || row_upd_changes_ord_field(node->row, node->index,
+							node->update)) {
+		err = row_upd_sec_index_entry(node, thr);
+
+		return(err);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************
+Marks the clustered index record deleted and inserts the updated version
+of the record into the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
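+When an ordering field changes, the record would no longer be at its
+correct position in the B-tree, so the update cannot be made in place:
+instead, the old version of the record is delete marked and a new index
+entry, carrying the updated field values and the trx id of the updating
+transaction, is inserted.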
*/ +static +ulint +row_upd_clust_rec_by_insert( +/*========================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index of the record */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + mem_heap_t* heap; + dtuple_t* entry; + ulint err; + + ut_ad(node); + ut_ad(index->type & DICT_CLUSTERED); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + if (node->state != UPD_NODE_INSERT_CLUSTERED) { + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + + return(err); + } + } + + mtr_commit(mtr); + + node->state = UPD_NODE_INSERT_CLUSTERED; + + heap = mem_heap_create(1024); + + entry = row_build_index_entry(node->row, index, heap); + + row_upd_clust_index_replace_new_col_vals(entry, node->update); + + row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); + + err = row_ins_index_entry(index, entry, thr); + + mem_heap_free(heap); + + return(err); +} + +/*************************************************************** +Updates a clustered index record of a row when the ordering fields do +not change. */ +static +ulint +row_upd_clust_rec( +/*==============*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(index->type & DICT_CLUSTERED); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } + + mtr_commit(mtr); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr_start(mtr); + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + + ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, + node->update, node->cmpl_info, thr, mtr); + mtr_commit(mtr); + + return(err); +} + +/*************************************************************** +Delete marks a clustered index record. 
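+The row is first stored to the heap of the update node, because the delete
+marked clustered index record is still needed for building the secondary
+index entries which must be delete marked as well.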
*/ +static +ulint +row_upd_del_mark_clust_rec( +/*=======================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(node->is_delete); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + + /* Store row because we have to build also the secondary index + entries */ + + row_upd_store_row(node); + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur, + TRUE, thr, mtr); + mtr_commit(mtr); + + return(err); +} + +/*************************************************************** +Updates the clustered index record. */ +static +ulint +row_upd_clust_step( +/*===============*/ + /* out: DB_SUCCESS if operation successfully + completed, DB_LOCK_WAIT in case of a lock wait, + else error code */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + ibool success; + ulint err; + mtr_t mtr_buf; + mtr_t* mtr; + + index = dict_table_get_first_index(node->table); + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + mtr = &mtr_buf; + + mtr_start(mtr); + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. 
*/
+
+	ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+	if (!success) {
+		err = DB_RECORD_NOT_FOUND;
+
+		mtr_commit(mtr);
+
+		return(err);
+	}
+
+	/* If this is a row in the SYS_INDEXES table of the data dictionary,
+	then we have to free the file segments of the index tree associated
+	with the index */
+
+	if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+		dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+		mtr_commit(mtr);
+
+		mtr_start(mtr);
+
+		success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+									mtr);
+		if (!success) {
+			err = DB_ERROR;
+
+			mtr_commit(mtr);
+
+			return(err);
+		}
+	}
+
+	if (!node->has_clust_rec_x_lock) {
+		err = lock_clust_rec_modify_check_and_lock(0,
+						btr_pcur_get_rec(pcur),
+						index, thr);
+		if (err != DB_SUCCESS) {
+			mtr_commit(mtr);
+
+			return(err);
+		}
+	}
+
+	/* NOTE: the following function calls will also commit mtr */
+
+	if (node->is_delete) {
+		err = row_upd_del_mark_clust_rec(node, index, thr, mtr);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_ALL_SEC;
+		node->index = dict_table_get_next_index(index);
+
+		return(err);
+	}
+
+	/* If the update is made for MySQL, we already have the update vector
+	ready, else we have to do some evaluation: */
+
+	if (!node->in_mysql_interface) {
+		/* Copy the necessary columns from clust_rec and calculate the
+		new values to set */
+
+		row_upd_copy_columns(btr_pcur_get_rec(pcur),
+					UT_LIST_GET_FIRST(node->columns));
+		row_upd_eval_new_vals(node->update);
+	}
+
+	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+		err = row_upd_clust_rec(node, index, thr, mtr);
+
+		return(err);
+	}
+
+	row_upd_store_row(node);
+
+	if (row_upd_changes_ord_field(node->row, index, node->update)) {
+
+		/* The update changes an ordering field (an ordering field
+		within the B-tree) of the clustered index record: perform
+		the update by delete marking and inserting.
+
+		TODO! What to do about the 'Halloween problem', where an
+		update moves the record forward in the index so that it is
+		again updated when the cursor arrives there? Solution: the
+		read operation must check the undo record undo number when
+		choosing records to update. MySQL currently solves the
+		problem externally! */
+
+		err = row_upd_clust_rec_by_insert(node, index, thr, mtr);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_ALL_SEC;
+	} else {
+		err = row_upd_clust_rec(node, index, thr, mtr);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_SOME_SEC;
+	}
+
+	node->index = dict_table_get_next_index(index);
+
+	return(err);
+}
+
+/***************************************************************
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was positioned
+on a record, and that its position has been stored.
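+The node state drives the processing: in state UPD_NODE_UPDATE_CLUSTERED
+(or UPD_NODE_INSERT_CLUSTERED, if an earlier call was interrupted by a lock
+wait during the insert phase) the clustered index record is updated first;
+the clustered index step then sets the state to UPD_NODE_UPDATE_ALL_SEC or
+UPD_NODE_UPDATE_SOME_SEC, and the loop below walks the remaining indexes
+with row_upd_sec_step.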
*/ +static +ulint +row_upd( +/*====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + + if (node->in_mysql_interface) { + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (row_upd_changes_some_index_ord_field(node->table, + node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + if (node->state == UPD_NODE_UPDATE_CLUSTERED + || node->state == UPD_NODE_INSERT_CLUSTERED) { + + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + } + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + + goto function_exit; + } + + while (node->index != NULL) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->index = dict_table_get_next_index(node->index); + } + +function_exit: + if (err == DB_SUCCESS) { + /* Do some cleanup */ + + if (node->row != NULL) { + mem_heap_empty(node->heap); + node->row = NULL; + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + } + + return(err); +} + +/*************************************************************** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_upd_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + ulint err = DB_SUCCESS; + trx_t* trx; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + node = thr->run_node; + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + trx_start_if_not_started(thr_get_trx(thr)); + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + return(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + return(thr); +} + 
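+/***************************************************************
+An illustrative sketch, not part of the compiled code: the protocol of
+row_upd_step above, as seen by the query graph executor which runs the
+node. The executor calls the step function and inspects the result:
+
+	thr = row_upd_step(thr);
+
+	thr->run_node == select node  : fetch the next row to update
+	thr->run_node == parent node  : the update statement is complete
+	return value NULL             : lock wait or error; the cause is
+					stored in trx->error_state
+***************************************************************/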
+/************************************************************************* +Performs an in-place update for the current clustered index record in +select. */ + +void +row_upd_in_place_in_select( +/*=======================*/ + sel_node_t* sel_node, /* in: select node */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + upd_node_t* node; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(sel_node->select_will_do_update); + ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF); + ut_ad(sel_node->asc); + + node = que_node_get_parent(sel_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Copy the necessary columns from clust_rec and calculate the new + values to set */ + + row_upd_copy_columns(btr_pcur_get_rec(pcur), + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + + ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + + ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE); + ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); + ut_ad(node->select_will_do_update); + + err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur, + node->update, node->cmpl_info, + thr, mtr); + ut_ad(err == DB_SUCCESS); +} diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c new file mode 100644 index 00000000000..80acc7225df --- /dev/null +++ b/innobase/row/row0vers.c @@ -0,0 +1,409 @@ +/****************************************************** +Row versions + +(c) 1997 Innobase Oy + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" + +#ifdef UNIV_NONINL +#include "row0vers.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" + +/********************************************************************* +Finds out if an active transaction has inserted or modified a secondary +index record. NOTE: the kernel mutex is temporarily released in this +function! */ + +trx_t* +row_vers_impl_x_locked_off_kernel( +/*==============================*/ + /* out: NULL if committed, else the active + transaction; NOTE that the kernel mutex is + temporarily released! */ + rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index) /* in: the secondary index */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + rec_t* version; + rec_t* prev_version; + dulint trx_id; + dulint prev_trx_id; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + dtuple_t* entry; + trx_t* trx; + ibool vers_del; + ibool rec_del; + ulint err; + mtr_t mtr; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Search for the clustered index record: this is a time-consuming + operation: therefore we release the kernel mutex; also, the release + is required by the latching order convention. The latch on the + clustered index locks the top of the stack of versions. We also + reserve purge_latch to lock the bottom of the version stack. 
*/ + + clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, + &clust_index, &mtr); + ut_a(clust_rec); + + trx_id = row_get_rec_trx_id(clust_rec, clust_index); + + mtr_s_lock(&(purge_sys->latch), &mtr); + + mutex_enter(&kernel_mutex); + + if (!trx_is_active(trx_id)) { + /* The transaction that modified or inserted clust_rec is no + longer active: no implicit lock on rec */ + + mtr_commit(&mtr); + + return(NULL); + } + + /* We look up if some earlier version of the clustered index record + would require rec to be in a different state (delete marked or + unmarked, or not existing). If there is such a version, then rec was + modified by the trx_id transaction, and it has an implicit x-lock on + rec. Note that if clust_rec itself would require rec to be in a + different state, then the trx_id transaction has not yet had time to + modify rec, and does not necessarily have an implicit x-lock on rec. */ + + rec_del = rec_get_deleted_flag(rec); + trx = NULL; + + version = clust_rec; + heap = NULL; + + for (;;) { + mutex_exit(&kernel_mutex); + + /* While we retrieve an earlier version of clust_rec, we + release the kernel mutex, because it may take time to access + the disk. After the release, we have to check if the trx_id + transaction is still active. We keep the semaphore in mtr on + the clust_rec page, so that no other transaction can update + it and get an implicit x-lock on rec. */ + + heap2 = heap; + heap = mem_heap_create(1024); + + err = trx_undo_prev_version_build(clust_rec, &mtr, version, + clust_index, heap, + &prev_version); + if (heap2) { + mem_heap_free(heap2); /* version was stored in heap2, + if heap2 != NULL */ + } + + if (prev_version) { + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, heap); + entry = row_build_index_entry(row, index, heap); + } + + mutex_enter(&kernel_mutex); + + if (!trx_is_active(trx_id)) { + /* Transaction no longer active: no implicit x-lock */ + + break; + } + + /* If the transaction is still active, the previous version + of clust_rec must be accessible if not a fresh insert; we + may assert the following: */ + + ut_ad(err == DB_SUCCESS); + + if (prev_version == NULL) { + /* It was a freshly inserted version: there is an + implicit x-lock on rec */ + + trx = trx_get_on_id(trx_id); + + break; + } + + /* If we get here, we know that the trx_id transaction is + still active and it has modified prev_version. Let us check + if prev_version would require rec to be in a different state. */ + + vers_del = rec_get_deleted_flag(prev_version); + + if (0 == cmp_dtuple_rec(entry, rec)) { + /* The delete marks of rec and prev_version should be + equal for rec to be in the state required by + prev_version */ + + if (rec_del != vers_del) { + trx = trx_get_on_id(trx_id); + + break; + } + } else if (!rec_del) { + /* The delete mark should be set in rec for it to be + in the state required by prev_version */ + + trx = trx_get_on_id(trx_id); + + break; + } + + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index); + + if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) { + /* The versions modified by the trx_id transaction end + to prev_version: no implicit x-lock */ + + break; + } + + version = prev_version; + }/* for (;;) */ + + mtr_commit(&mtr); + mem_heap_free(heap); + + return(trx); +} + +/********************************************************************* +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. 
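+The check is made from the update undo logs: if the update undo log written
+by the transaction whose id is stored in the version must still exist, i.e.,
+purge has not yet processed it, then the delete marked version may still be
+needed by some consistent read and must not be removed.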
*/ + +ibool +row_vers_must_preserve_del_marked( +/*==============================*/ + /* out: TRUE if earlier version should be preserved */ + dulint trx_id, /* in: transaction id in the version */ + mtr_t* mtr) /* in: mtr holding the latch on the clustered index + record; it will also hold the latch on purge_view */ +{ + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); + + mtr_s_lock(&(purge_sys->latch), mtr); + + if (trx_purge_update_undo_must_exist(trx_id)) { + + /* A purge operation is not yet allowed to remove this + delete marked record */ + + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************* +Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. */ + +ibool +row_vers_old_has_index_entry( +/*=========================*/ + /* out: TRUE if earlier version should have */ + ibool also_curr,/* in: TRUE if also rec is included in the + versions to search; otherwise only versions + prior to it are searched */ + rec_t* rec, /* in: record in the clustered index; the + caller must have a latch on the page */ + mtr_t* mtr, /* in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /* in: the secondary index */ + dtuple_t* ientry) /* in: the secondary index entry */ +{ + rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + dtuple_t* entry; + ulint err; + + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_S_FIX)); + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); + mtr_s_lock(&(purge_sys->latch), mtr); + + clust_index = dict_table_get_first_index(index->table); + + if (also_curr && !rec_get_deleted_flag(rec)) { + + heap = mem_heap_create(1024); + row = row_build(ROW_COPY_POINTERS, clust_index, rec, heap); + entry = row_build_index_entry(row, index, heap); + + if (dtuple_datas_are_equal(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + + mem_heap_free(heap); + } + + version = rec; + heap = NULL; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + + err = trx_undo_prev_version_build(rec, mtr, version, + clust_index, heap, + &prev_version); + if (heap2) { + mem_heap_free(heap2); /* version was stored in heap2, + if heap2 != NULL */ + } + + if ((err != DB_SUCCESS) || !prev_version) { + /* Versions end here */ + + mem_heap_free(heap); + + return(FALSE); + } + + if (!rec_get_deleted_flag(prev_version)) { + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, heap); + entry = row_build_index_entry(row, index, heap); + + if (dtuple_datas_are_equal(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = prev_version; + } +} + +/********************************************************************* +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version. 
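+A sketch of the construction, which walks the versions backwards in time:
+
+	version = rec;
+	while the read view does not see the trx id of version:
+		build the previous version from the undo log
+		(trx_undo_prev_version_build); if there is none, the row
+		was inserted after the view was opened: return NULL
+	copy the first version the view sees to in_heap and return it
+
+DB_MISSING_HISTORY results if an old version which the view would still
+need has already been removed by purge.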
*/ + +ulint +row_vers_build_for_consistent_read( +/*===============================*/ + /* out: DB_SUCCESS or DB_MISSING_HISTORY */ + rec_t* rec, /* in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /* in: mtr holding the latch on rec */ + dict_index_t* index, /* in: the clustered index */ + read_view_t* view, /* in: the consistent read view */ + mem_heap_t* in_heap,/* in: memory heap from which the memory for + old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers)/* out, own: old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ +{ + rec_t* version; + rec_t* prev_version; + dulint prev_trx_id; + mem_heap_t* heap; + mem_heap_t* heap2; + byte* buf; + ulint err; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_S_FIX)); + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); + ut_ad(!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))); + + rw_lock_s_lock(&(purge_sys->latch)); + version = rec; + heap = NULL; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + + err = trx_undo_prev_version_build(rec, mtr, version, index, + heap, &prev_version); + if (heap2) { + mem_heap_free(heap2); /* version was stored in heap2, + if heap2 != NULL */ + } + + if (err != DB_SUCCESS) { + break; + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + err = DB_SUCCESS; + + break; + } + + prev_trx_id = row_get_rec_trx_id(prev_version, index); + + if (read_view_sees_trx_id(view, prev_trx_id)) { + + /* The view already sees this version: we can copy + it to in_heap and return */ + + buf = mem_heap_alloc(in_heap, rec_get_size( + prev_version)); + *old_vers = rec_copy(buf, prev_version); + err = DB_SUCCESS; + + break; + } + + version = prev_version; + }/* for (;;) */ + + mem_heap_free(heap); + rw_lock_s_unlock(&(purge_sys->latch)); + + return(err); +} diff --git a/innobase/row/ts/makefile b/innobase/row/ts/makefile new file mode 100644 index 00000000000..589db50d4ed --- /dev/null +++ b/innobase/row/ts/makefile @@ -0,0 +1,16 @@ + + + +include ..\..\makefile.i + +tstcur: ..\tcur.lib tstcur.c + $(CCOM) $(CFL) -I.. -I..\.. 
..\tcur.lib ..\..\trx.lib ..\..\btr.lib ..\..\fut.lib ..\..\fsp.lib ..\..\page.lib ..\..\dyn.lib ..\..\mtr.lib ..\..\log.lib ..\..\rem.lib ..\..\fil.lib ..\..\buf.lib ..\..\dict.lib ..\..\data.lib ..\..\mach.lib ..\..\ha.lib ..\..\ut.lib ..\..\sync.lib ..\..\mem.lib ..\..\os.lib tstcur.c $(LFL) + + + + + + + + + diff --git a/innobase/row/ts/tstcur.c b/innobase/row/ts/tstcur.c new file mode 100644 index 00000000000..f5a5eb1f9f3 --- /dev/null +++ b/innobase/row/ts/tstcur.c @@ -0,0 +1,1087 @@ +/************************************************************************ +Test for the index system + +(c) 1994-1996 Innobase Oy + +Created 2/16/1996 Heikki Tuuri +*************************************************************************/ + +#include "sync0sync.h" +#include "ut0mem.h" +#include "mem0mem.h" +#include "data0data.h" +#include "data0type.h" +#include "dict0dict.h" +#include "buf0buf.h" +#include "os0file.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "mtr0mtr.h" +#include "log0log.h" +#include "page0page.h" +#include "page0cur.h" +#include "trx0trx.h" +#include "dict0boot.h" +#include "trx0sys.h" +#include "dict0crea.h" +#include "btr0btr.h" +#include "btr0pcur.h" +#include "rem0rec.h" +#include "..\tcur0ins.h" + +os_file_t files[1000]; + +mutex_t ios_mutex; +ulint ios; +ulint n[10]; + +mutex_t incs_mutex; +ulint incs; + +byte bigbuf[1000000]; + +#define N_SPACES 1 +#define N_FILES 1 +#define FILE_SIZE 4000 /* must be > 512 */ +#define POOL_SIZE 1000 +#define COUNTER_OFFSET 1500 + +#define LOOP_SIZE 150 +#define N_THREADS 5 + + +ulint zero = 0; + +buf_block_t* bl_arr[POOL_SIZE]; + +/************************************************************************ +Io-handler thread function. */ + +ulint +handler_thread( +/*===========*/ + void* arg) +{ + ulint segment; + void* mess; + ulint i; + bool ret; + + segment = *((ulint*)arg); + + printf("Io handler thread %lu starts\n", segment); + + for (i = 0;; i++) { + ret = fil_aio_wait(segment, &mess); + ut_a(ret); + + buf_page_io_complete((buf_block_t*)mess); + + mutex_enter(&ios_mutex); + ios++; + mutex_exit(&ios_mutex); + + } + + return(0); +} + +/************************************************************************* +Creates the files for the file system test and inserts them to +the file system. */ + +void +create_files(void) +/*==============*/ +{ + bool ret; + ulint i, k; + char name[20]; + os_thread_t thr[5]; + os_thread_id_t id[5]; + + printf("--------------------------------------------------------\n"); + printf("Create or open database files\n"); + + strcpy(name, "tsfile00"); + + for (k = 0; k < N_SPACES; k++) { + for (i = 0; i < N_FILES; i++) { + + name[6] = (char)((ulint)'0' + k); + name[7] = (char)((ulint)'0' + i); + + files[i] = os_file_create(name, OS_FILE_CREATE, + OS_FILE_TABLESPACE, &ret); + + if (ret == FALSE) { + ut_a(os_file_get_last_error() == + OS_FILE_ALREADY_EXISTS); + + files[i] = os_file_create( + name, OS_FILE_OPEN, + OS_FILE_TABLESPACE, &ret); + + ut_a(ret); + } + + ret = os_file_close(files[i]); + ut_a(ret); + + if (i == 0) { + fil_space_create(name, k, OS_FILE_TABLESPACE); + } + + ut_a(fil_validate()); + + fil_node_create(name, FILE_SIZE, k); + } + } + + ios = 0; + + mutex_create(&ios_mutex); + + for (i = 0; i < 5; i++) { + n[i] = i; + + thr[i] = os_thread_create(handler_thread, n + i, id + i); + } +} + +/************************************************************************ +Inits space header of space 0. 
*/ + +void +init_space(void) +/*============*/ +{ + mtr_t mtr; + + printf("Init space header\n"); + + mtr_start(&mtr); + + fsp_header_init(0, FILE_SIZE * N_FILES, &mtr); + + mtr_commit(&mtr); +} + +/********************************************************************* +Test for index page. */ + +void +test1(void) +/*=======*/ +{ + dtuple_t* tuple; + mem_heap_t* heap; + mem_heap_t* heap2; + ulint rnd = 0; + dict_index_t* index; + dict_table_t* table; + byte buf[16]; + ulint i, j; + ulint tm, oldtm; + trx_t* trx; +/* dict_tree_t* tree;*/ + btr_pcur_t pcur; + btr_pcur_t pcur2; + mtr_t mtr; + mtr_t mtr2; + byte* field; + ulint len; + dtuple_t* search_tuple; + dict_tree_t* index_tree; + rec_t* rec; + + UT_NOT_USED(len); + UT_NOT_USED(field); + UT_NOT_USED(pcur2); +/* + printf("\n\n\nPress 2 x enter to start test\n"); + + while (EOF == getchar()) { + + } + + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 1. CREATE TABLE WITH 3 COLUMNS AND WITH 3 INDEXES\n"); + + heap = mem_heap_create(1024); + heap2 = mem_heap_create(1024); + + trx = trx_start(ULINT_UNDEFINED); + + table = dict_mem_table_create("TS_TABLE1", 0, 3); + + dict_mem_table_add_col(table, "COL1", DATA_VARCHAR, + DATA_ENGLISH, 10, 0); + dict_mem_table_add_col(table, "COL2", DATA_VARCHAR, + DATA_ENGLISH, 10, 0); + dict_mem_table_add_col(table, "COL3", DATA_VARCHAR, + DATA_ENGLISH, 100, 0); + + ut_a(TRUE == dict_create_table(table, trx)); + + index = dict_mem_index_create("TS_TABLE1", "IND1", 75046, + DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "COL1", 0); + dict_mem_index_add_field(index, "COL2", 0); + + ut_a(mem_heap_validate(index->heap)); + + ut_a(TRUE == dict_create_index(index, trx)); + + trx_commit(trx); + + trx = trx_start(ULINT_UNDEFINED); + + index = dict_mem_index_create("TS_TABLE1", "IND2", 0, DICT_UNIQUE, 1); + + dict_mem_index_add_field(index, "COL2", 0); + + ut_a(mem_heap_validate(index->heap)); + + ut_a(TRUE == dict_create_index(index, trx)); + + trx_commit(trx); + + trx = trx_start(ULINT_UNDEFINED); + + index = dict_mem_index_create("TS_TABLE1", "IND3", 0, DICT_UNIQUE, 1); + + dict_mem_index_add_field(index, "COL2", 0); + + ut_a(mem_heap_validate(index->heap)); + + ut_a(TRUE == dict_create_index(index, trx)); + + trx_commit(trx); +/* + tree = dict_index_get_tree(dict_table_get_first_index(table)); + + btr_print_tree(tree, 10); +*/ + dict_table_print(table); + + /*---------------------------------------------------------*/ +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 2. INSERT 1 ROW TO THE TABLE\n"); + + trx = trx_start(ULINT_UNDEFINED); + + tuple = dtuple_create(heap, 3); + + table = dict_table_get("TS_TABLE1", trx); + + dtuple_gen_test_tuple3(tuple, 0, buf); + tcur_insert(tuple, table, heap2, trx); + + trx_commit(trx); +/* + tree = dict_index_get_tree(dict_table_get_first_index(table)); + + btr_print_tree(tree, 10); +*/ +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 3. 
INSERT MANY ROWS TO THE TABLE IN A SINGLE TRX\n"); + + rnd = 0; + oldtm = ut_clock(); + + trx = trx_start(ULINT_UNDEFINED); + for (i = 0; i < 300 * UNIV_DBC * UNIV_DBC; i++) { + + if (i % 5000 == 0) { + /* dict_table_print(table); + buf_print(); + buf_LRU_print(); + printf("%lu rows inserted\n", i); */ + } + + table = dict_table_get("TS_TABLE1", trx); + + if (i == 2180) { + rnd = rnd % 200000; + } + + rnd = (rnd + 1) % 200000; + + dtuple_gen_test_tuple3(tuple, rnd, buf); + + tcur_insert(tuple, table, heap2, trx); + + mem_heap_empty(heap2); + + if (i % 4 == 3) { + } + } + trx_commit(trx); + + tm = ut_clock(); + printf("Wall time for test %lu milliseconds\n", tm - oldtm); + printf("%lu rows inserted\n", i); +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 4. PRINT PART OF CONTENTS OF EACH INDEX TREE\n"); + +/* + mem_print_info(); +*/ + +/* + tree = dict_index_get_tree(dict_table_get_first_index(table)); + + btr_print_tree(tree, 10); + + tree = dict_index_get_tree(dict_table_get_next_index( + dict_table_get_first_index(table))); + + btr_print_tree(tree, 5); +*/ +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ +/* mem_print_info(); */ + + os_thread_sleep(5000000); + + for (j = 0; j < 5; j++) { + printf("-------------------------------------------------\n"); + printf("TEST 5. CALCULATE THE JOIN OF THE TABLE WITH ITSELF\n"); + + i = 0; + + oldtm = ut_clock(); + + mtr_start(&mtr); + + index_tree = dict_index_get_tree(UT_LIST_GET_FIRST(table->indexes)); + + search_tuple = dtuple_create(heap, 2); + + dtuple_gen_search_tuple3(search_tuple, i, buf); + + btr_pcur_open(index_tree, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + ut_a(btr_pcur_move_to_next(&pcur, &mtr)); + + while (!btr_pcur_is_after_last_in_tree(&pcur, &mtr)) { + + if (i % 20000 == 0) { + printf("%lu rows joined\n", i); + } + + index_tree = dict_index_get_tree( + UT_LIST_GET_FIRST(table->indexes)); + + rec = btr_pcur_get_rec(&pcur); + + rec_copy_prefix_to_dtuple(search_tuple, rec, 2, heap2); + + mtr_start(&mtr2); + + btr_pcur_open(index_tree, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur2, &mtr2); + + btr_pcur_move_to_next(&pcur2, &mtr2); + + rec = btr_pcur_get_rec(&pcur2); + + field = rec_get_nth_field(rec, 1, &len); + + ut_a(len == 8); + + ut_a(ut_memcmp(field, dfield_get_data( + dtuple_get_nth_field(search_tuple, 1)), + len) == 0); + + btr_pcur_close(&pcur2, &mtr); + + mem_heap_empty(heap2); + + mtr_commit(&mtr2); + + btr_pcur_store_position(&pcur, &mtr); + mtr_commit(&mtr); + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + + btr_pcur_move_to_next(&pcur, &mtr); + i++; + } + + btr_pcur_close(&pcur, &mtr); + mtr_commit(&mtr); + + tm = ut_clock(); + printf("Wall time for test %lu milliseconds\n", tm - oldtm); + printf("%lu rows joined\n", i); + } + + oldtm = ut_clock(); + +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 6. 
INSERT MANY ROWS TO THE TABLE IN SEPARATE TRXS\n"); + + rnd = 200000; + + for (i = 0; i < 350; i++) { + + if (i % 4 == 0) { + } + trx = trx_start(ULINT_UNDEFINED); + + table = dict_table_get("TS_TABLE1", trx); + + if (i == 2180) { + rnd = rnd % 200000; + } + + rnd = (rnd + 1) % 200000; + + dtuple_gen_test_tuple3(tuple, rnd, buf); + + tcur_insert(tuple, table, heap2, trx); + + trx_commit(trx); + + mem_heap_empty(heap2); + if (i % 4 == 3) { + } + } + + tm = ut_clock(); + printf("Wall time for test %lu milliseconds\n", tm - oldtm); + printf("%lu rows inserted in %lu transactions\n", i, i); +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 7. PRINT MEMORY ALLOCATION INFO\n"); + + mem_print_info(); +/* + printf("\n\n\nPress 2 x enter to continue test\n"); + + while (EOF == getchar()) { + + } + getchar(); +*/ + printf("-------------------------------------------------\n"); + printf("TEST 8. PRINT SEMAPHORE INFO\n"); + + sync_print(); + + + +#ifdef notdefined + rnd = 90000; + + oldtm = ut_clock(); + + for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) { + + mtr_start(&mtr); + + if (i == 50000) { + rnd = rnd % 200000; + } + + rnd = (rnd + 595659561) % 200000; + + dtuple_gen_test_tuple3(tuple, rnd, buf); + + btr_pcur_open(tree, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &cursor, &mtr); + + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf("Wall time for test %lu milliseconds\n", tm - oldtm); + + rnd = 0; + + oldtm = ut_clock(); + + for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) { + + mtr_start(&mtr); + + rnd = (rnd + 35608971) % 200000 + 1; + + dtuple_gen_test_tuple3(tuple, rnd, buf); + + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf("Wall time for test %lu milliseconds\n", tm - oldtm); + +/* btr_print_tree(tree, 3); */ + +#endif + mem_heap_free(heap); +} + + +#ifdef notdefined + + mtr_start(&mtr); + + block = buf_page_create(0, 5, &mtr); + buf_page_x_lock(block, &mtr); + + frame = buf_block_get_frame(block); + + page = page_create(frame, &mtr); + + for (i = 0; i < 512; i++) { + + rnd = (rnd + 534671) % 512; + + if (i % 27 == 0) { + ut_a(page_validate(page, index)); + } + + dtuple_gen_test_tuple(tuple, rnd); + +/* dtuple_print(tuple);*/ + + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr); + + ut_a(rec); + + rec_validate(rec); +/* page_print_list(page, 151); */ + } + +/* page_print_list(page, 151); */ + + ut_a(page_validate(page, index)); + ut_a(page_get_n_recs(page) == 512); + + for (i = 0; i < 512; i++) { + + rnd = (rnd + 7771) % 512; + + if (i % 27 == 0) { + ut_a(page_validate(page, index)); + } + + dtuple_gen_test_tuple(tuple, rnd); + +/* dtuple_print(tuple);*/ + + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + page_cur_delete_rec(&cursor, &mtr); + + ut_a(rec); + + rec_validate(rec); +/* page_print_list(page, 151); */ + } + + ut_a(page_get_n_recs(page) == 0); + + ut_a(page_validate(page, index)); + page = page_create(frame, &mtr); + + rnd = 311; + + for (i = 0; i < 512; i++) { + + rnd = (rnd + 1) % 512; + + if (i % 27 == 0) { + ut_a(page_validate(page, index)); + } + + dtuple_gen_test_tuple(tuple, rnd); + +/* dtuple_print(tuple);*/ + + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr); + + ut_a(rec); + + rec_validate(rec); +/* page_print_list(page, 151); */ + } + + ut_a(page_validate(page, index)); + ut_a(page_get_n_recs(page) 
== 512); + + rnd = 217; + + for (i = 0; i < 512; i++) { + + rnd = (rnd + 1) % 512; + + if (i % 27 == 0) { + ut_a(page_validate(page, index)); + } + + dtuple_gen_test_tuple(tuple, rnd); + +/* dtuple_print(tuple);*/ + + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + page_cur_delete_rec(&cursor, &mtr); + + ut_a(rec); + + rec_validate(rec); +/* page_print_list(page, 151); */ + } + + ut_a(page_validate(page, index)); + ut_a(page_get_n_recs(page) == 0); + page = page_create(frame, &mtr); + + rnd = 291; + + for (i = 0; i < 512; i++) { + + rnd = (rnd - 1) % 512; + + if (i % 27 == 0) { + ut_a(page_validate(page, index)); + } + + dtuple_gen_test_tuple(tuple, rnd); + +/* dtuple_print(tuple);*/ + + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr); + + ut_a(rec); + + rec_validate(rec); +/* page_print_list(page, 151); */ + } + + ut_a(page_validate(page, index)); + ut_a(page_get_n_recs(page) == 512); + + rnd = 277; + + for (i = 0; i < 512; i++) { + + rnd = (rnd - 1) % 512; + + if (i % 27 == 0) { + ut_a(page_validate(page, index)); + } + + dtuple_gen_test_tuple(tuple, rnd); + +/* dtuple_print(tuple);*/ + + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + page_cur_delete_rec(&cursor, &mtr); + + ut_a(rec); + + rec_validate(rec); +/* page_print_list(page, 151); */ + } + + ut_a(page_validate(page, index)); + ut_a(page_get_n_recs(page) == 0); + + mtr_commit(&mtr); + mem_heap_free(heap); +} + +/********************************************************************* +Test for index page. */ + +void +test2(void) +/*=======*/ +{ + page_t* page; + dtuple_t* tuple; + mem_heap_t* heap; + ulint i, j; + ulint rnd = 0; + rec_t* rec; + page_cur_t cursor; + dict_index_t* index; + dict_table_t* table; + buf_block_t* block; + buf_frame_t* frame; + ulint tm, oldtm; + byte buf[8]; + mtr_t mtr; + + printf("-------------------------------------------------\n"); + printf("TEST 2. 
Speed test\n"); + + oldtm = ut_clock(); + + for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) { + ut_memcpy(bigbuf, bigbuf + 800, 800); + } + + tm = ut_clock(); + printf("Wall time for %lu mem copys of 800 bytes %lu millisecs\n", + i, tm - oldtm); + + oldtm = ut_clock(); + + rnd = 0; + for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) { + ut_memcpy(bigbuf + rnd, bigbuf + rnd + 800, 800); + rnd += 1600; + if (rnd > 995000) { + rnd = 0; + } + } + + tm = ut_clock(); + printf("Wall time for %lu mem copys of 800 bytes %lu millisecs\n", + i, tm - oldtm); + + heap = mem_heap_create(0); + + table = dict_table_create("TS_TABLE2", 2); + + dict_table_add_col(table, "COL1", DATA_VARCHAR, DATA_ENGLISH, 10, 0); + dict_table_add_col(table, "COL2", DATA_VARCHAR, DATA_ENGLISH, 10, 0); + + ut_a(0 == dict_table_publish(table)); + + index = dict_index_create("TS_TABLE2", "IND2", 0, 2, 0); + + dict_index_add_field(index, "COL1", 0); + dict_index_add_field(index, "COL2", 0); + + ut_a(0 == dict_index_publish(index)); + + index = dict_index_get("TS_TABLE2", "IND2"); + ut_a(index); + + tuple = dtuple_create(heap, 2); + + oldtm = ut_clock(); + + rnd = 677; + for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) { + + mtr_start(&mtr); + + block = buf_page_create(0, 5, &mtr); + buf_page_x_lock(block, &mtr); + + frame = buf_block_get_frame(block); + + page = page_create(frame, &mtr); + + for (j = 0; j < 250; j++) { + rnd = (rnd + 54841) % 1000; + dtuple_gen_test_tuple2(tuple, rnd, buf); + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr); + ut_a(rec); + } + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf("Wall time for insertion of %lu recs %lu milliseconds\n", + i * j, tm - oldtm); + + mtr_start(&mtr); + + block = buf_page_get(0, 5, &mtr); + buf_page_s_lock(block, &mtr); + + page = buf_block_get_frame(block); + ut_a(page_validate(page, index)); + mtr_commit(&mtr); + + oldtm = ut_clock(); + + rnd = 677; + for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) { + mtr_start(&mtr); + + block = buf_page_create(0, 5, &mtr); + buf_page_x_lock(block, &mtr); + + frame = buf_block_get_frame(block); + + page = page_create(frame, &mtr); + + for (j = 0; j < 250; j++) { + rnd = (rnd + 54841) % 1000; + dtuple_gen_test_tuple2(tuple, rnd, buf); + } + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf( + "Wall time for %lu empty loops with page create %lu milliseconds\n", + i * j, tm - oldtm); + + oldtm = ut_clock(); + + for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) { + + mtr_start(&mtr); + + block = buf_page_create(0, 5, &mtr); + buf_page_x_lock(block, &mtr); + + frame = buf_block_get_frame(block); + + page = page_create(frame, &mtr); + + rnd = 100; + for (j = 0; j < 250; j++) { + rnd = (rnd + 1) % 1000; + dtuple_gen_test_tuple2(tuple, rnd, buf); + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr); + ut_a(rec); + } + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf( + "Wall time for sequential insertion of %lu recs %lu milliseconds\n", + i * j, tm - oldtm); + + + oldtm = ut_clock(); + + for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) { + mtr_start(&mtr); + + block = buf_page_create(0, 5, &mtr); + buf_page_x_lock(block, &mtr); + + frame = buf_block_get_frame(block); + + page = page_create(frame, &mtr); + + rnd = 500; + for (j = 0; j < 250; j++) { + rnd = (rnd - 1) % 1000; + dtuple_gen_test_tuple2(tuple, rnd, buf); + page_cur_search(page, tuple, PAGE_CUR_G, &cursor); + + rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr); + 
+			ut_a(rec);
+		}
+		mtr_commit(&mtr);
+	}
+
+	tm = ut_clock();
+	printf(
+	"Wall time for descend. seq. insertion of %lu recs %lu milliseconds\n",
+		i * j, tm - oldtm);
+
+	oldtm = ut_clock();
+
+	for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+		mtr_start(&mtr);
+
+		block = buf_page_create(0, 5, &mtr);
+		buf_page_x_lock(block, &mtr);
+
+		frame = buf_block_get_frame(block);
+
+		page = page_create(frame, &mtr);
+
+		rnd = 677;
+
+		for (j = 0; j < 250; j++) {
+			rnd = (rnd + 54841) % 1000;
+			dtuple_gen_test_tuple2(tuple, rnd, buf);
+			page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+			rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+			ut_a(rec);
+		}
+
+		rnd = 677;
+		for (j = 0; j < 250; j++) {
+			rnd = (rnd + 54841) % 1000;
+			dtuple_gen_test_tuple2(tuple, rnd, buf);
+			page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+			page_cur_delete_rec(&cursor, &mtr);
+		}
+		ut_a(page_get_n_recs(page) == 0);
+
+		mtr_commit(&mtr);
+	}
+
+	tm = ut_clock();
+	printf("Wall time for insert and delete of %lu recs %lu milliseconds\n",
+		i * j, tm - oldtm);
+
+	mtr_start(&mtr);
+
+	block = buf_page_create(0, 5, &mtr);
+	buf_page_x_lock(block, &mtr);
+
+	frame = buf_block_get_frame(block);
+
+	page = page_create(frame, &mtr);
+
+	rnd = 677;
+
+	for (j = 0; j < 250; j++) {
+		rnd = (rnd + 54841) % 1000;
+		dtuple_gen_test_tuple2(tuple, rnd, buf);
+		page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+		rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+		ut_a(rec);
+	}
+	ut_a(page_validate(page, index));
+	mtr_print(&mtr);
+
+	oldtm = ut_clock();
+
+	for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+		rnd = 677;
+		for (j = 0; j < 250; j++) {
+			rnd = (rnd + 54841) % 1000;
+			dtuple_gen_test_tuple2(tuple, rnd, buf);
+			page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+		}
+	}
+
+	tm = ut_clock();
+	printf("Wall time for search of %lu recs %lu milliseconds\n",
+		i * j, tm - oldtm);
+
+	oldtm = ut_clock();
+
+	for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+		rnd = 677;
+		for (j = 0; j < 250; j++) {
+			rnd = (rnd + 54841) % 1000;
+			dtuple_gen_test_tuple2(tuple, rnd, buf);
+		}
+	}
+
+	tm = ut_clock();
+	printf("Wall time for %lu empty loops %lu milliseconds\n",
+		i * j, tm - oldtm);
+	mtr_commit(&mtr);
+}
+
+#endif
+
+/********************************************************************
+Main test function. */
+
+void
+main(void)
+/*======*/
+{
+	ulint	tm, oldtm;
+	mtr_t	mtr;
+
+	sync_init();
+	mem_init();
+	os_aio_init(160, 5);
+	fil_init(25);
+	buf_pool_init(POOL_SIZE, POOL_SIZE);
+	fsp_init();
+	log_init();
+
+	create_files();
+	init_space();
+
+	mtr_start(&mtr);
+
+	trx_sys_create(&mtr);
+	dict_create(&mtr);
+
+	mtr_commit(&mtr);
+
+
+	oldtm = ut_clock();
+
+	ut_rnd_set_seed(19);
+
+	test1();
+
+/*	mem_print_info(); */
+
+	tm = ut_clock();
+	printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+	printf("TESTS COMPLETED SUCCESSFULLY!\n");
+}
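Every page operation in the test driver above is bracketed by mtr_start and mtr_commit: the mini-transaction collects the page latches taken in between and releases them all at commit. As a closing illustration, here is that bracket reduced to a self-contained skeleton; my_mtr_t and the my_mtr_* functions are stand-ins for the mtr_t API exercised above, tracking only a latch list and none of the redo logging a real mini-transaction also performs.

	#include <stdio.h>

	#define MY_MTR_MAX_LATCHES	8

	/* Illustrative mini-transaction: remembers which latches were
	taken since start so that commit can release them. */
	typedef struct {
		int	n_latches;
		int	latch[MY_MTR_MAX_LATCHES];
	} my_mtr_t;

	static void
	my_mtr_start(my_mtr_t* mtr)
	{
		mtr->n_latches = 0;	/* nothing held yet */
	}

	static void
	my_mtr_note_latch(my_mtr_t* mtr, int latch)
	{
		if (mtr->n_latches < MY_MTR_MAX_LATCHES) {
			mtr->latch[mtr->n_latches++] = latch;
		}
	}

	static void
	my_mtr_commit(my_mtr_t* mtr)
	{
		while (mtr->n_latches > 0) {	/* release in LIFO order */
			mtr->n_latches--;
			printf("release latch %d\n",
				mtr->latch[mtr->n_latches]);
		}
	}

	int
	main(void)
	{
		my_mtr_t	mtr;

		my_mtr_start(&mtr);
		my_mtr_note_latch(&mtr, 5);	/* e.g. a page x-latch */
		my_mtr_commit(&mtr);		/* prints: release latch 5 */
		return(0);
	}

The LIFO release is meant to mirror how the memo of latches in a real mini-transaction is unwound at commit; holding every latch until commit is what makes each bracketed page operation atomic with respect to other threads.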