diff options
author | unknown <brian@zim.(none)> | 2005-04-26 19:07:13 -0700 |
---|---|---|
committer | unknown <brian@zim.(none)> | 2005-04-26 19:07:13 -0700 |
commit | a9da10f7a8e626aac0bd3a8d82b20c7b864c7061 (patch) | |
tree | 7028a9aade64c6143da00a4301627ce5d262c0f0 /storage/innobase/row | |
parent | a2ed27af5291f778c1fc6e23cf4edc1bc36f0bed (diff) | |
parent | 25311ea4a5f83652959a0744d99a4eb51aa9d328 (diff) | |
download | mariadb-git-a9da10f7a8e626aac0bd3a8d82b20c7b864c7061.tar.gz |
Merge baker@bk-internal.mysql.com:/home/bk/mysql-5.1
into zim.(none):/home/brian/mysql/mysql-5.1
configure.in:
Auto merged
mysql-test/mysql-test-run.sh:
Auto merged
sql/Makefile.am:
Auto merged
sql/ha_innodb.cc:
Auto merged
sql/ha_myisam.cc:
Auto merged
sql/lock.cc:
Auto merged
storage/innobase/btr/btr0pcur.c:
Auto merged
storage/innobase/dict/dict0dict.c:
Auto merged
storage/innobase/dict/dict0load.c:
Auto merged
storage/innobase/fil/fil0fil.c:
Auto merged
storage/innobase/include/btr0pcur.h:
Auto merged
storage/innobase/include/btr0pcur.ic:
Auto merged
storage/innobase/include/dict0dict.h:
Auto merged
storage/innobase/include/dict0load.h:
Auto merged
storage/innobase/include/os0file.h:
Auto merged
storage/innobase/include/srv0srv.h:
Auto merged
storage/innobase/log/log0log.c:
Auto merged
storage/innobase/os/os0file.c:
Auto merged
storage/innobase/row/row0ins.c:
Auto merged
storage/innobase/row/row0mysql.c:
Auto merged
storage/innobase/row/row0sel.c:
Auto merged
storage/innobase/srv/srv0srv.c:
Auto merged
storage/innobase/srv/srv0start.c:
Auto merged
storage/myisam/mi_check.c:
Auto merged
storage/myisam/mi_create.c:
Auto merged
storage/myisam/mi_dynrec.c:
Auto merged
storage/myisam/mi_search.c:
Auto merged
storage/myisam/mi_write.c:
Auto merged
storage/myisam/myisamdef.h:
Auto merged
storage/myisam/myisampack.c:
Auto merged
storage/myisammrg/myrg_create.c:
Auto merged
storage/ndb/include/kernel/signaldata/BackupImpl.hpp:
Auto merged
storage/ndb/include/kernel/signaldata/BackupSignalData.hpp:
Auto merged
storage/ndb/include/kernel/signaldata/TuxMaint.hpp:
Auto merged
storage/ndb/include/mgmapi/mgmapi_config_parameters.h:
Auto merged
storage/ndb/include/ndbapi/NdbScanOperation.hpp:
Auto merged
storage/ndb/include/ndbapi/NdbTransaction.hpp:
Auto merged
storage/ndb/src/common/debugger/signaldata/BackupImpl.cpp:
Auto merged
storage/ndb/src/kernel/blocks/ERROR_codes.txt:
Auto merged
storage/ndb/src/kernel/blocks/backup/Backup.cpp:
Auto merged
storage/ndb/src/kernel/blocks/backup/Backup.hpp:
Auto merged
storage/ndb/src/kernel/blocks/backup/Backup.txt:
Auto merged
storage/ndb/src/kernel/blocks/backup/BackupInit.cpp:
Auto merged
storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbacc/Dbacc.hpp:
Auto merged
storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp:
Auto merged
storage/ndb/src/kernel/main.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp:
Auto merged
storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp:
Auto merged
storage/ndb/src/kernel/blocks/dbtup/DbtupRoutines.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbtup/DbtupTrigger.cpp:
Auto merged
storage/ndb/src/kernel/blocks/dbtup/Notes.txt:
Auto merged
storage/ndb/src/kernel/blocks/dbtux/DbtuxNode.cpp:
Auto merged
storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
Auto merged
storage/ndb/src/kernel/blocks/ndbfs/OpenFiles.hpp:
Auto merged
storage/ndb/src/mgmapi/mgmapi.cpp:
Auto merged
storage/ndb/src/mgmsrv/ConfigInfo.cpp:
Auto merged
storage/ndb/src/mgmsrv/MgmtSrvr.cpp:
Auto merged
storage/ndb/src/mgmsrv/MgmtSrvr.hpp:
Auto merged
storage/ndb/src/mgmsrv/MgmtSrvrGeneralSignalHandling.cpp:
Auto merged
storage/ndb/src/ndbapi/Ndb.cpp:
Auto merged
storage/ndb/src/ndbapi/NdbDictionaryImpl.cpp:
Auto merged
storage/ndb/src/ndbapi/NdbIndexOperation.cpp:
Auto merged
storage/ndb/src/ndbapi/NdbOperationDefine.cpp:
Auto merged
storage/ndb/src/ndbapi/NdbOperationSearch.cpp:
Auto merged
storage/ndb/src/ndbapi/NdbScanOperation.cpp:
Auto merged
storage/ndb/src/ndbapi/NdbTransaction.cpp:
Auto merged
storage/ndb/src/ndbapi/Ndbif.cpp:
Auto merged
storage/ndb/src/ndbapi/Ndblist.cpp:
Auto merged
storage/ndb/src/ndbapi/TransporterFacade.cpp:
Auto merged
storage/ndb/src/ndbapi/ndberror.c:
Auto merged
storage/ndb/test/ndbapi/Makefile.am:
Auto merged
storage/ndb/test/ndbapi/testBackup.cpp:
Auto merged
storage/ndb/test/ndbapi/testIndex.cpp:
Auto merged
storage/ndb/test/ndbapi/testNodeRestart.cpp:
Auto merged
storage/ndb/test/ndbapi/testOIBasic.cpp:
Auto merged
storage/ndb/test/ndbapi/testOperations.cpp:
Auto merged
storage/ndb/test/run-test/daily-basic-tests.txt:
Auto merged
storage/ndb/test/run-test/daily-devel-tests.txt:
Auto merged
storage/ndb/test/src/NdbBackup.cpp:
Auto merged
storage/ndb/tools/desc.cpp:
Auto merged
Diffstat (limited to 'storage/innobase/row')
-rw-r--r-- | storage/innobase/row/Makefile.am | 25 | ||||
-rw-r--r-- | storage/innobase/row/makefilewin | 34 | ||||
-rw-r--r-- | storage/innobase/row/row0ins.c | 2424 | ||||
-rw-r--r-- | storage/innobase/row/row0mysql.c | 4085 | ||||
-rw-r--r-- | storage/innobase/row/row0purge.c | 671 | ||||
-rw-r--r-- | storage/innobase/row/row0row.c | 730 | ||||
-rw-r--r-- | storage/innobase/row/row0sel.c | 4066 | ||||
-rw-r--r-- | storage/innobase/row/row0uins.c | 308 | ||||
-rw-r--r-- | storage/innobase/row/row0umod.c | 773 | ||||
-rw-r--r-- | storage/innobase/row/row0undo.c | 350 | ||||
-rw-r--r-- | storage/innobase/row/row0upd.c | 2035 | ||||
-rw-r--r-- | storage/innobase/row/row0vers.c | 492 |
12 files changed, 15993 insertions, 0 deletions
diff --git a/storage/innobase/row/Makefile.am b/storage/innobase/row/Makefile.am new file mode 100644 index 00000000000..bd09f9a237d --- /dev/null +++ b/storage/innobase/row/Makefile.am @@ -0,0 +1,25 @@ +# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB +# & Innobase Oy +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +include ../include/Makefile.i + +noinst_LIBRARIES = librow.a + +librow_a_SOURCES = row0ins.c row0mysql.c row0purge.c row0row.c row0sel.c\ + row0uins.c row0umod.c row0undo.c row0upd.c row0vers.c + +EXTRA_PROGRAMS = diff --git a/storage/innobase/row/makefilewin b/storage/innobase/row/makefilewin new file mode 100644 index 00000000000..c17240c6119 --- /dev/null +++ b/storage/innobase/row/makefilewin @@ -0,0 +1,34 @@ +include ..\include\makefile.i + +row.lib: row0mysql.obj row0upd.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj + lib -out:..\libs\row.lib row0mysql.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj + +row0mysql.obj: row0mysql.c + $(CCOM) $(CFL) -c row0mysql.c + +row0ins.obj: row0ins.c + $(CCOM) $(CFL) -c row0ins.c + +row0sel.obj: row0sel.c + $(CCOM) $(CFL) -c row0sel.c + +row0upd.obj: row0upd.c + $(CCOM) $(CFL) -c row0upd.c + +row0undo.obj: row0undo.c + $(CCOM) $(CFL) -c row0undo.c + +row0purge.obj: row0purge.c + $(CCOM) $(CFL) -c row0purge.c + +row0row.obj: row0row.c + $(CCOM) $(CFL) -c row0row.c + +row0vers.obj: row0vers.c + $(CCOM) $(CFL) -c row0vers.c + +row0umod.obj: row0umod.c + $(CCOM) $(CFL) -c row0umod.c + +row0uins.obj: row0uins.c + $(CCOM) $(CFL) -c row0uins.c diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c new file mode 100644 index 00000000000..303fe5749bc --- /dev/null +++ b/storage/innobase/row/row0ins.c @@ -0,0 +1,2424 @@ +/****************************************************** +Insert into a table + +(c) 1996 Innobase Oy + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0ins.h" + +#ifdef UNIV_NONINL +#include "row0ins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0row.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "eval0eval.h" +#include "data0data.h" +#include "usr0sess.h" + +#define ROW_INS_PREV 1 +#define ROW_INS_NEXT 2 + + +/********************************************************************* +This prototype is copied from /mysql/sql/ha_innodb.cc. +Invalidates the MySQL query cache for the table. +NOTE that the exact prototype of this function has to be in +/innobase/row/row0ins.c! */ +extern +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /* in: transaction which modifies the table */ + char* full_name, /* in: concatenation of database name, null + char '\0', table name, null char'\0'; + NOTE that in Windows this is always + in LOWER CASE! */ + ulint full_name_len); /* in: full name length where also the null + chars count */ + +/********************************************************************** +This function returns true if + +1) SQL-query in the current thread +is either REPLACE or LOAD DATA INFILE REPLACE. + +2) SQL-query in the current thread +is INSERT ON DUPLICATE KEY UPDATE. + +NOTE that /mysql/innobase/row/row0ins.c must contain the +prototype for this function ! */ + +ibool +innobase_query_is_update(void); + +/************************************************************************* +Creates an insert node struct. */ + +ins_node_t* +ins_node_create( +/*============*/ + /* out, own: insert node struct */ + ulint ins_type, /* in: INS_VALUES, ... */ + dict_table_t* table, /* in: table where to insert */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + ins_node_t* node; + + node = mem_heap_alloc(heap, sizeof(ins_node_t)); + + node->common.type = QUE_NODE_INSERT; + + node->ins_type = ins_type; + + node->state = INS_NODE_SET_IX_LOCK; + node->table = table; + node->index = NULL; + node->entry = NULL; + + node->select = NULL; + + node->trx_id = ut_dulint_zero; + + node->entry_sys_heap = mem_heap_create(128); + + node->magic_n = INS_NODE_MAGIC_N; + + return(node); +} + +/*************************************************************** +Creates an entry template for each index of a table. */ +static +void +ins_node_create_entry_list( +/*=======================*/ + ins_node_t* node) /* in: row insert node */ +{ + dict_index_t* index; + dtuple_t* entry; + + ut_ad(node->entry_sys_heap); + + UT_LIST_INIT(node->entry_list); + + index = dict_table_get_first_index(node->table); + + while (index != NULL) { + entry = row_build_index_entry(node->row, index, + node->entry_sys_heap); + UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry); + + index = dict_table_get_next_index(index); + } +} + +/********************************************************************* +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /* in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + mem_heap_t* heap; + dict_col_t* col; + dfield_t* dfield; + ulint len; + byte* ptr; + + row = node->row; + table = node->table; + heap = node->entry_sys_heap; + + ut_ad(row && table && heap); + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* 1. Allocate buffer for row id */ + + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN); + + node->row_id_buf = ptr; + + if (table->type == DICT_TABLE_CLUSTER_MEMBER) { + + /* 2. Fill in the dfield for mix id */ + + col = dict_table_get_sys_col(table, DATA_MIX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + len = mach_dulint_get_compressed_size(table->mix_id); + ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN); + + mach_dulint_write_compressed(ptr, table->mix_id); + dfield_set_data(dfield, ptr, len); + } + + /* 3. Allocate buffer for trx id */ + + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN); + + node->trx_id_buf = ptr; + + /* 4. Allocate buffer for roll ptr */ + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + + dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN); +} + +/************************************************************************* +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ + +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /* in: insert node */ + dtuple_t* row) /* in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry = NULL; + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = ut_dulint_zero; +} + +/*********************************************************************** +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is +kept in the index for consistent reads. */ +static +ulint +row_ins_sec_index_entry_by_modify( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /* in: B-tree cursor */ + dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + big_rec_t* dummy_big_rec; + mem_heap_t* heap; + upd_t* update; + rec_t* rec; + ulint err; + + rec = btr_cur_get_rec(cursor); + + ut_ad((cursor->index->type & DICT_CLUSTERED) == 0); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); + + /* We know that in the alphabetical ordering, entry and rec are + identified. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the + difference. */ + + heap = mem_heap_create(1024); + + update = row_upd_build_sec_rec_difference_binary(cursor->index, + entry, rec, thr_get_trx(thr), heap); + if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor, + update, 0, thr, mtr); + if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor, + &dummy_big_rec, update, 0, thr, mtr); + } + + mem_heap_free(heap); + + return(err); +} + +/*********************************************************************** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. */ +static +ulint +row_ins_clust_index_entry_by_modify( +/*================================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /* in: B-tree cursor */ + big_rec_t** big_rec,/* out: possible big rec vector of fields + which have to be stored externally by the + caller */ + dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + mem_heap_t* heap; + rec_t* rec; + upd_t* update; + ulint err; + + ut_ad(cursor->index->type & DICT_CLUSTERED); + + *big_rec = NULL; + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); + + heap = mem_heap_create(1024); + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ + + update = row_upd_build_difference_binary(cursor->index, entry, ext_vec, + n_ext_vec, rec, thr_get_trx(thr), heap); + if (mode == BTR_MODIFY_LEAF) { + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(0, cursor, update, 0, thr, + mtr); + if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update(0, cursor, big_rec, update, + 0, thr, mtr); + } + + mem_heap_free(heap); + + return(err); +} + +/************************************************************************* +Returns TRUE if in a cascaded update/delete an ancestor node of node +updates (not DELETE, but UPDATE) table. */ +static +ibool +row_ins_cascade_ancestor_updates_table( +/*===================================*/ + /* out: TRUE if an ancestor updates table */ + que_node_t* node, /* in: node in a query graph */ + dict_table_t* table) /* in: table */ +{ + que_node_t* parent; + upd_node_t* upd_node; + + parent = que_node_get_parent(node); + + while (que_node_get_type(parent) == QUE_NODE_UPDATE) { + + upd_node = parent; + + if (upd_node->table == table && upd_node->is_delete == FALSE) { + + return(TRUE); + } + + parent = que_node_get_parent(parent); + + ut_a(parent); + } + + return(FALSE); +} + +/************************************************************************* +Returns the number of ancestor UPDATE or DELETE nodes of a +cascaded update/delete node. */ +static +ulint +row_ins_cascade_n_ancestors( +/*========================*/ + /* out: number of ancestors */ + que_node_t* node) /* in: node in a query graph */ +{ + que_node_t* parent; + ulint n_ancestors = 0; + + parent = que_node_get_parent(node); + + while (que_node_get_type(parent) == QUE_NODE_UPDATE) { + n_ancestors++; + + parent = que_node_get_parent(parent); + + ut_a(parent); + } + + return(n_ancestors); +} + +/********************************************************************** +Calculates the update vector node->cascade->update for a child table in +a cascaded update. */ +static +ulint +row_ins_cascade_calc_update_vec( +/*============================*/ + /* out: number of fields in the + calculated update vector; the value + can also be 0 if no foreign key + fields changed; the returned value + is ULINT_UNDEFINED if the column + type in the child table is too short + to fit the new value in the parent + table: that means the update fails */ + upd_node_t* node, /* in: update node of the parent + table */ + dict_foreign_t* foreign, /* in: foreign key constraint whose + type is != 0 */ + mem_heap_t* heap) /* in: memory heap to use as + temporary storage */ +{ + upd_node_t* cascade = node->cascade_node; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index = foreign->foreign_index; + upd_t* update; + upd_field_t* ufield; + dict_table_t* parent_table; + dict_index_t* parent_index; + upd_t* parent_update; + upd_field_t* parent_ufield; + ulint n_fields_updated; + ulint parent_field_no; + dtype_t* type; + ulint i; + ulint j; + + ut_a(node && foreign && cascade && table && index); + + /* Calculate the appropriate update vector which will set the fields + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. */ + + parent_table = node->table; + ut_a(parent_table == foreign->referenced_table); + parent_index = foreign->referenced_index; + parent_update = node->update; + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + n_fields_updated = 0; + + for (i = 0; i < foreign->n_fields; i++) { + + parent_field_no = dict_table_get_nth_col_pos( + parent_table, + dict_index_get_nth_col_no( + parent_index, i)); + + for (j = 0; j < parent_update->n_fields; j++) { + parent_ufield = parent_update->fields + j; + + if (parent_ufield->field_no == parent_field_no) { + + ulint fixed_size; + + /* A field in the parent index record is + updated. Let us make the update vector + field for the child table. */ + + ufield = update->fields + n_fields_updated; + + ufield->field_no = + dict_table_get_nth_col_pos(table, + dict_index_get_nth_col_no(index, i)); + ufield->exp = NULL; + + ufield->new_val = parent_ufield->new_val; + + type = dict_index_get_nth_type(index, i); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (ufield->new_val.len == UNIV_SQL_NULL + && (type->prtype & DATA_NOT_NULL)) { + + return(ULINT_UNDEFINED); + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (ufield->new_val.len != UNIV_SQL_NULL + && ufield->new_val.len + > dtype_get_len(type)) { + + return(ULINT_UNDEFINED); + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + fixed_size = dtype_get_fixed_size(type); + + /* TODO: pad in UCS-2 with 0x0020. + TODO: How does the special truncation of + UTF-8 CHAR cols affect this? */ + + if (fixed_size + && ufield->new_val.len != UNIV_SQL_NULL + && ufield->new_val.len < fixed_size) { + + ufield->new_val.data = + mem_heap_alloc(heap, + fixed_size); + ufield->new_val.len = fixed_size; + ut_a(dtype_get_pad_char(type) + != ULINT_UNDEFINED); + + memset(ufield->new_val.data, + (byte)dtype_get_pad_char(type), + fixed_size); + ut_memcpy(ufield->new_val.data, + parent_ufield->new_val.data, + parent_ufield->new_val.len); + } + + ufield->extern_storage = FALSE; + + n_fields_updated++; + } + } + } + + update->n_fields = n_fields_updated; + + return(n_fields_updated); +} + +/************************************************************************* +Reports a foreign key error associated with an update or a delete of a +parent table index entry. */ +static +void +row_ins_foreign_report_err( +/*=======================*/ + const char* errstr, /* in: error string from the viewpoint + of the parent table */ + que_thr_t* thr, /* in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /* in: foreign key constraint */ + rec_t* rec, /* in: a matching index record in the + child table */ + dtuple_t* entry) /* in: index entry in the parent + table */ +{ + FILE* ef = dict_foreign_err_file; + trx_t* trx = thr_get_trx(thr); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign); + putc('\n', ef); + fputs(errstr, ef); + fputs(" in parent table, in index ", ef); + ut_print_name(ef, trx, foreign->referenced_index->name); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in child table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, foreign->foreign_index->name); + if (rec) { + fputs(", there is a record:\n", ef); + rec_print(ef, rec, foreign->foreign_index); + } else { + fputs(", the record is not available\n", ef); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/************************************************************************* +Reports a foreign key error to dict_foreign_err_buf when we are trying +to add an index entry to a child table. Note that the adding may be the result +of an update, too. */ +static +void +row_ins_foreign_report_add_err( +/*===========================*/ + trx_t* trx, /* in: transaction */ + dict_foreign_t* foreign, /* in: foreign key constraint */ + rec_t* rec, /* in: a record in the parent table: + it does not match entry because we + have an error! */ + dtuple_t* entry) /* in: index entry to insert in the + child table */ +{ + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx); + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign); + fputs("\nTrying to add in child table, in index ", ef); + ut_print_name(ef, trx, foreign->foreign_index->name); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in parent table ", ef); + ut_print_name(ef, trx, foreign->referenced_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, foreign->referenced_index->name); + fputs(",\nthe closest match we can find is record:\n", ef); + if (rec && page_rec_is_supremum(rec)) { + /* If the cursor ended on a supremum record, it is better + to report the previous record in the error message, so that + the user gets a more descriptive error message. */ + rec = page_rec_get_prev(rec); + } + + if (rec) { + rec_print(ef, rec, foreign->foreign_index); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/************************************************************************* +Invalidate the query cache for the given table. */ +static +void +row_ins_invalidate_query_cache( +/*===========================*/ + que_thr_t* thr, /* in: query thread whose run_node + is an update node */ + const char* name) /* in: table name prefixed with + database name and a '/' character */ +{ + char* buf; + char* ptr; + ulint len = strlen(name) + 1; + + buf = mem_strdupl(name, len); + + ptr = strchr(buf, '/'); + ut_a(ptr); + *ptr = '\0'; + + /* We call a function in ha_innodb.cc */ +#ifndef UNIV_HOTBACKUP + innobase_invalidate_query_cache(thr_get_trx(thr), buf, len); +#endif + mem_free(buf); +} + +/************************************************************************* +Perform referential actions or checks when a parent row is deleted or updated +and the constraint had an ON DELETE or ON UPDATE condition which was not +RESTRICT. */ +static +ulint +row_ins_foreign_check_on_constraint( +/*================================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + or error code */ + que_thr_t* thr, /* in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /* in: foreign key constraint whose + type is != 0 */ + btr_pcur_t* pcur, /* in: cursor placed on a matching + index record in the child table */ + dtuple_t* entry, /* in: index entry in the parent + table */ + mtr_t* mtr) /* in: mtr holding the latch of pcur + page */ +{ + upd_node_t* node; + upd_node_t* cascade; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index; + dict_index_t* clust_index; + dtuple_t* ref; + mem_heap_t* upd_vec_heap = NULL; + rec_t* rec; + rec_t* clust_rec; + upd_t* update; + ulint n_to_update; + ulint err; + ulint i; + trx_t* trx; + mem_heap_t* tmp_heap = NULL; + + ut_a(thr && foreign && pcur && mtr); + + trx = thr_get_trx(thr); + + /* Since we are going to delete or update a row, we have to invalidate + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the sync0sync.h rank above the kernel mutex. The query cache mutex has + a rank just above the kernel mutex. */ + + row_ins_invalidate_query_cache(thr, table->name); + + node = thr->run_node; + + if (node->is_delete && 0 == (foreign->type & + (DICT_FOREIGN_ON_DELETE_CASCADE + | DICT_FOREIGN_ON_DELETE_SET_NULL))) { + + row_ins_foreign_report_err("Trying to delete", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (!node->is_delete && 0 == (foreign->type & + (DICT_FOREIGN_ON_UPDATE_CASCADE + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* This is an UPDATE */ + + row_ins_foreign_report_err("Trying to update", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (node->cascade_node == NULL) { + /* Extend our query graph by creating a child to current + update node. The child is used in the cascade or set null + operation. */ + + node->cascade_heap = mem_heap_create(128); + node->cascade_node = row_create_update_node_for_mysql( + table, node->cascade_heap); + que_node_set_parent(node->cascade_node, node); + } + + /* Initialize cascade_node to do the operation we want. Note that we + use the SAME cascade node to do all foreign key operations of the + SQL DELETE: the table of the cascade node may change if there are + several child tables to the table where the delete is done! */ + + cascade = node->cascade_node; + + cascade->table = table; + + cascade->foreign = foreign; + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = TRUE; + } else { + cascade->is_delete = FALSE; + + if (foreign->n_fields > cascade->update_n_fields) { + /* We have to make the update vector longer */ + + cascade->update = upd_create(foreign->n_fields, + node->cascade_heap); + cascade->update_n_fields = foreign->n_fields; + } + } + + /* We do not allow cyclic cascaded updating (DELETE is allowed, + but not UPDATE) of the same table, as this can lead to an infinite + cycle. Check that we are not updating the same table which is + already being modified in this cascade chain. We have to check + this also because the modification of the indexes of a 'parent' + table may still be incomplete, and we must avoid seeing the indexes + of the parent table in an inconsistent state! */ + + if (!cascade->is_delete + && row_ins_cascade_ancestor_updates_table(cascade, table)) { + + /* We do not know if this would break foreign key + constraints, but play safe and return an error */ + + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( +"Trying an update, possibly causing a cyclic cascaded update\n" +"in the child table,", thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (row_ins_cascade_n_ancestors(cascade) >= 15) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( +"Trying a too deep cascaded delete or update\n", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + index = btr_pcur_get_btr_cur(pcur)->index; + + ut_a(index == foreign->foreign_index); + + rec = btr_pcur_get_rec(pcur); + + if (index->type & DICT_CLUSTERED) { + /* pcur is already positioned in the clustered index of + the child table */ + + clust_index = index; + clust_rec = rec; + } else { + /* We have to look for the record in the clustered index + in the child table */ + + clust_index = dict_table_get_first_index(table); + + tmp_heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, + tmp_heap); + btr_pcur_open_with_no_init(clust_index, ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + cascade->pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(cascade->pcur); + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + fputs( + "InnoDB: error in cascade of a foreign key op\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + fputs("\n" + "InnoDB: clustered record ", stderr); + rec_print(stderr, clust_rec, clust_index); + fputs("\n" +"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Set an X-lock on the row to delete or update in the child table */ + + err = lock_table(0, table, LOCK_IX, thr); + + if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + + err = lock_clust_rec_read_check_and_lock_alt(0, clust_rec, + clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); + } + + if (err != DB_SUCCESS) { + + goto nonstandard_exit_func; + } + + if (rec_get_deleted_flag(clust_rec, table->comp)) { + /* This can happen if there is a circular reference of + rows such that cascading delete comes to delete a row + already in the process of being delete marked */ + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + if ((node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)) + || (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* Build the appropriate update vector which sets + foreign->n_fields first fields in rec to SQL NULL */ + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + for (i = 0; i < foreign->n_fields; i++) { + (update->fields + i)->field_no + = dict_table_get_nth_col_pos(table, + dict_index_get_nth_col_no(index, i)); + (update->fields + i)->exp = NULL; + (update->fields + i)->new_val.len = UNIV_SQL_NULL; + (update->fields + i)->new_val.data = NULL; + (update->fields + i)->extern_storage = FALSE; + } + } + + if (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) { + + /* Build the appropriate update vector which sets changing + foreign->n_fields first fields in rec to new values */ + + upd_vec_heap = mem_heap_create(256); + + n_to_update = row_ins_cascade_calc_update_vec(node, foreign, + upd_vec_heap); + if (n_to_update == ULINT_UNDEFINED) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( +"Trying a cascaded update where the updated value in the child\n" +"table would not fit in the length of the column, or the value would\n" +"be NULL and the column is declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (cascade->update->n_fields == 0) { + + /* The update does not change any columns referred + to in this foreign key constraint: no need to do + anything */ + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Store pcur position and initialize or store the cascade node + pcur stored position */ + + btr_pcur_store_position(pcur, mtr); + + if (index == clust_index) { + btr_pcur_copy_stored_position(cascade->pcur, pcur); + } else { + btr_pcur_store_position(cascade->pcur, mtr); + } + + mtr_commit(mtr); + + ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON); + + cascade->state = UPD_NODE_UPDATE_CLUSTERED; + + err = row_update_cascade_for_mysql(thr, cascade, + foreign->foreign_table); + + if (foreign->foreign_table->n_foreign_key_checks_running == 0) { + fprintf(stderr, +"InnoDB: error: table %s has the counter 0 though there is\n" +"InnoDB: a FOREIGN KEY check running on it.\n", + foreign->foreign_table->name); + } + + /* Release the data dictionary latch for a while, so that we do not + starve other threads from doing CREATE TABLE etc. if we have a huge + cascaded operation running. The counter n_foreign_key_checks_running + will prevent other users from dropping or ALTERing the table when we + release the latch. */ + + row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); + row_mysql_freeze_data_dictionary(thr_get_trx(thr)); + + mtr_start(mtr); + + /* Restore pcur position */ + + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + + return(err); + +nonstandard_exit_func: + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + + btr_pcur_store_position(pcur, mtr); + + mtr_commit(mtr); + mtr_start(mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + return(err); +} + +/************************************************************************* +Sets a shared lock on a record. Used in locking possible duplicate key +records and also in checking foreign key constraints. */ +static +ulint +row_ins_set_shared_rec_lock( +/*========================*/ + /* out: DB_SUCCESS or error code */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (index->type & DICT_CLUSTERED) { + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); + } + + return(err); +} + +/************************************************************************* +Sets a exclusive lock on a record. Used in locking possible duplicate key +records */ +static +ulint +row_ins_set_exclusive_rec_lock( +/*============================*/ + /* out: DB_SUCCESS or error code */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (index->type & DICT_CLUSTERED) { + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); + } + + return(err); +} + +/******************************************************************* +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_operation_lock. */ + +ulint +row_ins_check_foreign_constraint( +/*=============================*/ + /* out: DB_SUCCESS, + DB_NO_REFERENCED_ROW, + or DB_ROW_IS_REFERENCED */ + ibool check_ref,/* in: TRUE if we want to check that + the referenced table is ok, FALSE if we + want to to check the foreign key table */ + dict_foreign_t* foreign,/* in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /* in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /* in: index entry for index */ + que_thr_t* thr) /* in: query thread */ +{ + upd_node_t* upd_node; + dict_table_t* check_table; + dict_index_t* check_index; + ulint n_fields_cmp; + rec_t* rec; + btr_pcur_t pcur; + ibool moved; + int cmp; + ulint err; + ulint i; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + +run_again: +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + err = DB_SUCCESS; + + if (trx->check_foreigns == FALSE) { + /* The user has suppressed foreign key checks currently for + this session */ + goto exit_func; + } + + /* If any of the foreign key fields in entry is SQL NULL, we + suppress the foreign key check: this is compatible with Oracle, + for example */ + + for (i = 0; i < foreign->n_fields; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + goto exit_func; + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) { + upd_node = thr->run_node; + + if (!(upd_node->is_delete) && upd_node->foreign == foreign) { + /* If a cascaded update is done as defined by a + foreign key constraint, do not check that + constraint for the child row. In ON UPDATE CASCADE + the update of the parent row is only half done when + we come here: if we would check the constraint here + for the child row it would fail. + + A QUESTION remains: if in the child table there are + several constraints which refer to the same parent + table, we should merge all updates to the child as + one update? And the updates can be contradictory! + Currently we just perform the update associated + with each foreign key constraint, one after + another, and the user has problems predicting in + which order they are performed. */ + + goto exit_func; + } + } + + if (check_ref) { + check_table = foreign->referenced_table; + check_index = foreign->referenced_index; + } else { + check_table = foreign->foreign_table; + check_index = foreign->foreign_index; + } + + if (check_table == NULL || check_table->ibd_file_missing) { + if (check_ref) { + FILE* ef = dict_foreign_err_file; + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx); + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, + trx, foreign); + fputs("\nTrying to add to index ", ef); + ut_print_name(ef, trx, foreign->foreign_index->name); + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + fputs("\nBut the parent table ", ef); + ut_print_name(ef, trx, foreign->referenced_table_name); + fputs("\nor its .ibd file does not currently exist!\n", ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_NO_REFERENCED_ROW; + } + + goto exit_func; + } + + ut_a(check_table && check_index); + + if (check_table != table) { + /* We already have a LOCK_IX on table, but not necessarily + on check_table */ + + err = lock_table(0, check_table, LOCK_IS, thr); + + if (err != DB_SUCCESS) { + + goto do_possible_lock_wait; + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, foreign->n_fields); + + btr_pcur_open(check_index, entry, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + /* Scan index records and check if there is a matching record */ + + for (;;) { + rec = btr_pcur_get_rec(&pcur); + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + goto next_rec; + } + + offsets = rec_get_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, &heap); + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, + check_index, offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + + goto next_rec; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, rec, + check_index, offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + } else { + /* Found a matching record. Lock only + a record because we can allow inserts + into gaps */ + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, rec, + check_index, offsets, thr); + + if (err != DB_SUCCESS) { + + break; + } + + if (check_ref) { + err = DB_SUCCESS; + + break; + } else if (foreign->type != 0) { + /* There is an ON UPDATE or ON DELETE + condition: check them in a separate + function */ + + err = + row_ins_foreign_check_on_constraint( + thr, foreign, &pcur, entry, + &mtr); + if (err != DB_SUCCESS) { + + break; + } + } else { + row_ins_foreign_report_err( + "Trying to delete or update", + thr, foreign, rec, entry); + + err = DB_ROW_IS_REFERENCED; + break; + } + } + } + + if (cmp < 0) { + err = row_ins_set_shared_rec_lock(LOCK_GAP, + rec, check_index, offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + + if (check_ref) { + err = DB_NO_REFERENCED_ROW; + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + } else { + err = DB_SUCCESS; + } + + break; + } + + ut_a(cmp == 0); +next_rec: + moved = btr_pcur_move_to_next(&pcur, &mtr); + + if (!moved) { + if (check_ref) { + rec = btr_pcur_get_rec(&pcur); + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + err = DB_NO_REFERENCED_ROW; + } else { + err = DB_SUCCESS; + } + + break; + } + } + + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + +do_possible_lock_wait: + if (err == DB_LOCK_WAIT) { + trx->error_state = err; + + que_thr_stop_for_mysql(thr); + + srv_suspend_mysql_thread(thr); + + if (trx->error_state == DB_SUCCESS) { + + goto run_again; + } + + err = trx->error_state; + } + +exit_func: + if (heap) { + mem_heap_free(heap); + } + return(err); +} + +/******************************************************************* +Checks if foreign key constraints fail for an index entry. If index +is not mentioned in any constraint, this function does nothing, +Otherwise does searches to the indexes of referenced tables and +sets shared locks which lock either the success or the failure of +a constraint. */ +static +ulint +row_ins_check_foreign_constraints( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + dict_table_t* table, /* in: table */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry for index */ + que_thr_t* thr) /* in: query thread */ +{ + dict_foreign_t* foreign; + ulint err; + trx_t* trx; + ibool got_s_lock = FALSE; + + trx = thr_get_trx(thr); + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign) { + if (foreign->foreign_index == index) { + + if (foreign->referenced_table == NULL) { + dict_table_get(foreign->referenced_table_name, + trx); + } + + if (0 == trx->dict_operation_lock_mode) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + if (foreign->referenced_table) { + mutex_enter(&(dict_sys->mutex)); + + (foreign->referenced_table + ->n_foreign_key_checks_running)++; + + mutex_exit(&(dict_sys->mutex)); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects the referenced + table from being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint(TRUE, foreign, + table, entry, thr); + + if (foreign->referenced_table) { + mutex_enter(&(dict_sys->mutex)); + + ut_a(foreign->referenced_table + ->n_foreign_key_checks_running > 0); + (foreign->referenced_table + ->n_foreign_key_checks_running)--; + + mutex_exit(&(dict_sys->mutex)); + } + + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + if (err != DB_SUCCESS) { + return(err); + } + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + return(DB_SUCCESS); +} + +/******************************************************************* +Checks if a unique key violation to rec would occur at the index entry +insert. */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + /* out: TRUE if error */ + rec_t* rec, /* in: user record; NOTE that we assume + that the caller already has a record lock on + the record! */ + dtuple_t* entry, /* in: entry to insert */ + dict_index_t* index, /* in: index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!(index->type & DICT_CLUSTERED)) { + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + if (!rec_get_deleted_flag(rec, index->table->comp)) { + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************* +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. +Set shared locks on possible duplicate records. */ +static +ulint +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + /* out: DB_SUCCESS, DB_DUPLICATE_KEY, or + DB_LOCK_WAIT */ + dict_index_t* index, /* in: non-clustered unique index */ + dtuple_t* entry, /* in: index entry */ + que_thr_t* thr) /* in: query thread */ +{ +#ifndef UNIV_HOTBACKUP + ulint n_unique; + ulint i; + int cmp; + ulint n_fields_cmp; + rec_t* rec; + btr_pcur_t pcur; + ulint err = DB_SUCCESS; + ibool moved; + mtr_t mtr; + trx_t* trx; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(DB_SUCCESS); + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index)); + + btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + + /* Scan index records and check if there is a duplicate */ + + for (;;) { + rec = btr_pcur_get_rec(&pcur); + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + goto next_rec; + } + + /* Try to place a lock on the index record */ + + trx = thr_get_trx(thr); + ut_ad(trx); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (innobase_query_is_update()) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); + } + + if (err != DB_SUCCESS) { + + break; + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + goto next_rec; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { + err = DB_DUPLICATE_KEY; + + thr_get_trx(thr)->error_info = index; + + break; + } + } + + if (cmp < 0) { + break; + } + + ut_a(cmp == 0); +next_rec: + moved = btr_pcur_move_to_next(&pcur, &mtr); + + if (!moved) { + break; + } + } + + if (heap) { + mem_heap_free(heap); + } + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + return(err); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +/******************************************************************* +Checks if a unique key violation error would occur at an index entry +insert. Sets shared locks on possible duplicate records. Works only +for a clustered index! */ +static +ulint +row_ins_duplicate_error_in_clust( +/*=============================*/ + /* out: DB_SUCCESS if no error, + DB_DUPLICATE_KEY if error, DB_LOCK_WAIT if we + have to wait for a lock on a possible + duplicate record */ + btr_cur_t* cursor, /* in: B-tree cursor */ + dtuple_t* entry, /* in: entry to insert */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ +#ifndef UNIV_HOTBACKUP + ulint err; + rec_t* rec; + page_t* page; + ulint n_unique; + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + UT_NOT_USED(mtr); + + ut_a(cursor->index->type & DICT_CLUSTERED); + ut_ad(cursor->index->type & DICT_UNIQUE); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row refererence part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that in the B-tree node pointers on an + upper level may match more to the entry than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case. */ + + n_unique = dict_index_get_n_unique(cursor->index); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + page = buf_frame_align(rec); + + if (rec != page_get_infimum_rec(page)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + /* We set a lock on the possible duplicate: this + is needed in logical logging of MySQL to make + sure that in roll-forward we get the same duplicate + errors as in original execution */ + + if (innobase_query_is_update()) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP,rec,cursor->index, + offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP,rec, cursor->index, + offsets, thr); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (row_ins_dupl_error_with_rec(rec, entry, + cursor->index, offsets)) { + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + } + } + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + page = buf_frame_align(rec); + + if (rec != page_get_supremum_rec(page)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + if (innobase_query_is_update()) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (row_ins_dupl_error_with_rec(rec, entry, + cursor->index, offsets)) { + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + mem_heap_free(heap); + } + + ut_a(!(cursor->index->type & DICT_CLUSTERED)); + /* This should never happen */ + } + + err = DB_SUCCESS; +func_exit: + return(err); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +/******************************************************************* +Checks if an index entry has long enough common prefix with an existing +record so that the intended insert of the entry must be changed to a modify of +the existing record. In the case of a clustered index, the prefix must be +n_unique fields long, and in the case of a secondary index, all fields must be +equal. */ +UNIV_INLINE +ulint +row_ins_must_modify( +/*================*/ + /* out: 0 if no update, ROW_INS_PREV if + previous should be updated; currently we + do the search so that only the low_match + record can match enough to the search tuple, + not the next record */ + btr_cur_t* cursor) /* in: B-tree cursor */ +{ + ulint enough_match; + rec_t* rec; + page_t* page; + + /* NOTE: (compare to the note in row_ins_duplicate_error) Because node + pointers on upper levels of the B-tree may match more to entry than + to actual user records on the leaf level, we have to check if the + candidate record is actually a user record. In a clustered index + node pointers contain index->n_unique first fields, and in the case + of a secondary index, all fields of the index. */ + + enough_match = dict_index_get_n_unique_in_tree(cursor->index); + + if (cursor->low_match >= enough_match) { + + rec = btr_cur_get_rec(cursor); + page = buf_frame_align(rec); + + if (rec != page_get_infimum_rec(page)) { + + return(ROW_INS_PREV); + } + } + + return(0); +} + +/******************************************************************* +Tries to insert an index entry to an index. If the index is clustered +and a record with the same unique key is found, the other record is +necessarily marked deleted by a committed transaction, or a unique key +violation error occurs. The delete marked record is then updated to an +existing record, and we must write an undo log record on the delete +marked record. If the index is secondary, and a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. */ + +ulint +row_ins_index_entry_low( +/*====================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL + if pessimistic retry needed, or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ + que_thr_t* thr) /* in: query thread */ +{ + btr_cur_t cursor; + ulint ignore_sec_unique = 0; + ulint modify = 0; /* remove warning */ + rec_t* insert_rec; + rec_t* rec; + rec_t* first_rec; + ulint err; + ulint n_unique; + big_rec_t* big_rec = NULL; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + log_free_check(); + + mtr_start(&mtr); + + cursor.thr = thr; + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (!(thr_get_trx(thr)->check_unique_secondary)) { + ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE; + } + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + mode | BTR_INSERT | ignore_sec_unique, + &cursor, 0, &mtr); + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + /* The insertion was made to the insert buffer already during + the search: we are done */ + + err = DB_SUCCESS; + + goto function_exit; + } + + first_rec = page_rec_get_next(page_get_infimum_rec( + buf_frame_align(btr_cur_get_rec(&cursor)))); + + if (!page_rec_is_supremum(first_rec)) { + ut_a(rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } + + n_unique = dict_index_get_n_unique(index); + + if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique + || cursor.low_match >= n_unique)) { + + if (index->type & DICT_CLUSTERED) { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust(&cursor, + entry, thr, &mtr); + if (err != DB_SUCCESS) { + + goto function_exit; + } + } else { + mtr_commit(&mtr); + err = row_ins_scan_sec_index_for_duplicate(index, + entry, thr); + mtr_start(&mtr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion. */ + + btr_cur_search_to_nth_level(index, 0, entry, + PAGE_CUR_LE, mode | BTR_INSERT, + &cursor, 0, &mtr); + } + } + + modify = row_ins_must_modify(&cursor); + + if (modify != 0) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + + if (modify == ROW_INS_NEXT) { + rec = page_rec_get_next(btr_cur_get_rec(&cursor)); + + btr_cur_position(index, rec, &cursor); + } + + if (index->type & DICT_CLUSTERED) { + err = row_ins_clust_index_entry_by_modify(mode, + &cursor, &big_rec, + entry, + ext_vec, n_ext_vec, + thr, &mtr); + } else { + err = row_ins_sec_index_entry_by_modify(mode, &cursor, + entry, + thr, &mtr); + } + + } else { + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert(0, &cursor, entry, + &insert_rec, &big_rec, thr, &mtr); + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_insert(0, &cursor, entry, + &insert_rec, &big_rec, thr, &mtr); + } + + if (err == DB_SUCCESS) { + if (ext_vec) { + rec_set_field_extern_bits(insert_rec, index, + ext_vec, n_ext_vec, &mtr); + } + } + } + +function_exit: + mtr_commit(&mtr); + + if (big_rec) { + rec_t* rec; + mtr_start(&mtr); + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields(index, rec, + offsets, big_rec, &mtr); + + if (modify) { + dtuple_big_rec_free(big_rec); + } else { + dtuple_convert_back_big_rec(index, entry, big_rec); + } + + mtr_commit(&mtr); + } + + if (heap) { + mem_heap_free(heap); + } + return(err); +} + +/******************************************************************* +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. */ + +ulint +row_ins_index_entry( +/*================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DUPLICATE_KEY, or some other error code */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + if (UT_LIST_GET_FIRST(index->table->foreign_list)) { + err = row_ins_check_foreign_constraints(index->table, index, + entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + /* Try first optimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, + ext_vec, n_ext_vec, thr); + if (err != DB_FAIL) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, + ext_vec, n_ext_vec, thr); + return(err); +} + +/*************************************************************** +Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. */ +static +void +row_ins_index_entry_set_vals( +/*=========================*/ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to make */ + dtuple_t* row) /* in: row */ +{ + dict_field_t* ind_field; + dfield_t* field; + dfield_t* row_field; + ulint n_fields; + ulint i; + dtype_t* cur_type; + + ut_ad(entry && row); + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + + row_field = dtuple_get_nth_field(row, ind_field->col->ind); + + /* Check column prefix indexes */ + if (ind_field->prefix_len > 0 + && dfield_get_len(row_field) != UNIV_SQL_NULL) { + + cur_type = dict_col_get_type( + dict_field_get_col(ind_field)); + + field->len = dtype_get_at_most_n_mbchars(cur_type, + ind_field->prefix_len, + dfield_get_len(row_field), row_field->data); + } else { + field->len = row_field->len; + } + + field->data = row_field->data; + } +} + +/*************************************************************** +Inserts a single index entry to the table. */ +static +ulint +row_ins_index_entry_step( +/*=====================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + ins_node_t* node, /* in: row insert node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(dtuple_check_typed(node->row)); + + row_ins_index_entry_set_vals(node->index, node->entry, node->row); + + ut_ad(dtuple_check_typed(node->entry)); + + err = row_ins_index_entry(node->index, node->entry, NULL, 0, thr); + + return(err); +} + +/*************************************************************** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /* in: row insert node */ +{ + dulint row_id; + + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + + if (dict_table_get_first_index(node->table)->type & DICT_UNIQUE) { + + /* No row id is stored if the clustered index is unique */ + + return; + } + + /* Fill in row id value to row */ + + row_id = dict_sys_get_new_row_id(); + + dict_sys_write_row_id(node->row_id_buf, row_id); +} + +/*************************************************************** +Gets a row to insert from the values list. */ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /* in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/*************************************************************** +Gets a row to insert from the select list. */ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /* in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/*************************************************************** +Inserts a row to a table. */ + +ulint +row_ins( +/*====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + ins_node_t* node, /* in: row insert node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + node->entry = UT_LIST_GET_FIRST(node->entry_list); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (node->index != NULL) { + err = row_ins_index_entry_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + } + + ut_ad(node->entry == NULL); + + node->state = INS_NODE_ALLOC_ROW_ID; + + return(DB_SUCCESS); +} + +/*************************************************************** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_ins_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + ulint err; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started(trx); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. */ + + if (node->state == INS_NODE_SET_IX_LOCK) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + if (UT_DULINT_EQ(trx->id, node->trx_id)) { + /* No need to do IX-locking or write trx id to buf */ + + goto same_trx; + } + + trx_write_trx_id(node->trx_id_buf, trx->id); + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + + node->trx_id = trx->id; + same_trx: + node->state = INS_NODE_ALLOC_ROW_ID; + + if (node->ins_type == INS_SEARCHED) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to insert */ + + thr->run_node = sel_node; + + return(thr); + } + } + + if ((node->ins_type == INS_SEARCHED) + && (sel_node->state != SEL_NODE_FETCH)) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to insert */ + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_ins(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* err == DB_LOCK_WAIT or SQL error detected */ + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->ins_type == INS_SEARCHED) { + /* Fetch a row to insert */ + + thr->run_node = sel_node; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c new file mode 100644 index 00000000000..7f78a5b723b --- /dev/null +++ b/storage/innobase/row/row0mysql.c @@ -0,0 +1,4085 @@ +/****************************************************** +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. + +(c) 2000 Innobase Oy + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#include "row0ins.h" +#include "row0sel.h" +#include "row0upd.h" +#include "row0row.h" +#include "que0que.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "dict0crea.h" +#include "dict0load.h" +#include "dict0boot.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "lock0lock.h" +#include "rem0cmp.h" +#include "log0log.h" +#include "btr0sea.h" +#include "fil0fil.h" +#include "ibuf0ibuf.h" + +/* A dummy variable used to fool the compiler */ +ibool row_mysql_identically_false = FALSE; + +/* List of tables we should drop in background. ALTER TABLE in MySQL requires +that the table handler can drop the table in background when there are no +queries to it any more. Protected by the kernel mutex. */ +typedef struct row_mysql_drop_struct row_mysql_drop_t; +struct row_mysql_drop_struct{ + char* table_name; + UT_LIST_NODE_T(row_mysql_drop_t) row_mysql_drop_list; +}; + +UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list; +ibool row_mysql_drop_list_inited = FALSE; + +/* Magic table names for invoking various monitor threads */ +static const char S_innodb_monitor[] = "innodb_monitor"; +static const char S_innodb_lock_monitor[] = "innodb_lock_monitor"; +static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor"; +static const char S_innodb_table_monitor[] = "innodb_table_monitor"; +static const char S_innodb_mem_validate[] = "innodb_mem_validate"; + +/* Name suffix for recovered orphaned temporary tables */ +static const char S_recover_innodb_tmp_table[] = "_recover_innodb_tmp_table"; +/*********************************************************************** +Determine if the given name ends in the suffix reserved for recovered +orphaned temporary tables. */ +static +ibool +row_mysql_is_recovered_tmp_table( +/*=============================*/ + /* out: TRUE if table name ends in + the reserved suffix */ + const char* name) +{ + ulint namelen = strlen(name) + 1; + return(namelen >= sizeof S_recover_innodb_tmp_table + && !memcmp(name + namelen - + sizeof S_recover_innodb_tmp_table, + S_recover_innodb_tmp_table, + sizeof S_recover_innodb_tmp_table)); +} + +/*********************************************************************** +Determine if the given name is a name reserved for MySQL system tables. */ +static +ibool +row_mysql_is_system_table( +/*======================*/ + /* out: TRUE if name is a MySQL + system table name */ + const char* name) +{ + if (memcmp(name, "mysql/", 6)) { + return(FALSE); + } + return(0 == strcmp(name + 6, "host") + || 0 == strcmp(name + 6, "user") + || 0 == strcmp(name + 6, "db")); +} + +/*********************************************************************** +Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */ +static +void +row_mysql_delay_if_needed(void) +/*===========================*/ +{ + if (srv_dml_needed_delay) { + os_thread_sleep(srv_dml_needed_delay); + } +} + +/*********************************************************************** +Frees the blob heap in prebuilt when no longer needed. */ + +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct of a + ha_innobase:: table handle */ +{ + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; +} + +/*********************************************************************** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. */ + +byte* +row_mysql_store_true_var_len( +/*=========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + byte* dest, /* in: where to store */ + ulint len, /* in: length, must fit in two bytes */ + ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*********************************************************************** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. */ + +byte* +row_mysql_read_true_varchar( +/*========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + ulint* len, /* out: variable-length field length */ + byte* field, /* in: field in the MySQL format */ + ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*********************************************************************** +Stores a reference to a BLOB in the MySQL format. */ + +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /* in: where to store */ + ulint col_len, /* in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + byte* data, /* in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len) /* in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + + memset(dest, '\0', col_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. */ + + ut_a(col_len - 8 > 1 || len < 256); + ut_a(col_len - 8 > 2 || len < 256 * 256); + ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + ut_memcpy(dest + col_len - 8, (byte*)&data, sizeof(byte*)); +} + +/*********************************************************************** +Reads a reference to a BLOB in the MySQL format. */ + +byte* +row_mysql_read_blob_ref( +/*====================*/ + /* out: pointer to BLOB data */ + ulint* len, /* out: BLOB length */ + byte* ref, /* in: BLOB reference in the MySQL format */ + ulint col_len) /* in: BLOB reference length (not BLOB + length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + ut_memcpy((byte*)&data, ref + col_len - 8, sizeof(byte*)); + + return(data); +} + +/****************************************************************** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.c. */ + +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + /* out: up to which byte we used + buf in the conversion */ + dfield_t* dfield, /* in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /* in/out: buffer for a converted + integer value; this must be at least + col_len long then! */ + ibool row_format_col, /* TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + byte* mysql_data, /* in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /* in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ibool comp) /* in: TRUE = compact format */ +{ + byte* ptr = mysql_data; + dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. */ + + ptr = buf + col_len; + + for (;;) { + ptr--; + *ptr = *mysql_data; + if (ptr == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *ptr = (byte) (*ptr ^ 128); + } + + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle UCS2 strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + if (mbminlen == 2) { + /* space=0x0020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~1; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + } else { + ut_a(mbminlen == 1); + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store a latin1 string! */ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string ".a " + where "." denotes a 3-byte character represented by + the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string ".abc " + will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.c, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (type == DATA_BLOB && row_format_col) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/****************************************************************** +Convert a row in the MySQL format to a row in the Innobase format. Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.c. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /* in/out: Innobase row where the + field type information is already + copied there! */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + byte* mysql_rec) /* in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ +{ + mysql_row_templ_t* templ; + dfield_t* dfield; + ulint i; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + dfield = dtuple_get_nth_field(row, i); + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] & + (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_data(dfield, NULL, UNIV_SQL_NULL); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format(dfield, + prebuilt->ins_upd_rec_buff + + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + prebuilt->table->comp); +next_column: + ; + } +} + +/******************************************************************** +Handles user errors and lock waits detected by the database engine. */ + +ibool +row_mysql_handle_errors( +/*====================*/ + /* out: TRUE if it was a lock wait and + we should continue running the query thread */ + ulint* new_err,/* out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /* in: transaction */ + que_thr_t* thr, /* in: query thread */ + trx_savept_t* savept) /* in: savepoint or NULL */ +{ +#ifndef UNIV_HOTBACKUP + ulint err; + +handle_new_error: + err = trx->error_state; + + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + if (err == DB_DUPLICATE_KEY) { + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + } else if (err == DB_TOO_BIG_RECORD) { + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + /* MySQL will roll back the latest SQL statement */ + } else if (err == DB_ROW_IS_REFERENCED + || err == DB_NO_REFERENCED_ROW + || err == DB_CANNOT_ADD_CONSTRAINT) { + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + /* MySQL will roll back the latest SQL statement */ + } else if (err == DB_LOCK_WAIT) { + + srv_suspend_mysql_thread(thr); + + if (trx->error_state != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + goto handle_new_error; + } + + *new_err = err; + + return(TRUE); + + } else if (err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT + || err == DB_LOCK_TABLE_FULL) { + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + } else if (err == DB_OUT_OF_FILE_SPACE) { + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + /* MySQL will roll back the latest SQL statement */ + + } else if (err == DB_MUST_GET_MORE_FILE_SPACE) { + + fputs( + "InnoDB: The database cannot continue operation because of\n" + "InnoDB: lack of space. You must add a new data file to\n" + "InnoDB: my.cnf and restart the database.\n", stderr); + + exit(1); + } else if (err == DB_CORRUPTION) { + + fputs( + "InnoDB: We detected index corruption in an InnoDB type table.\n" + "InnoDB: You have to dump + drop + reimport the table or, in\n" + "InnoDB: a case of widespread corruption, dump all InnoDB\n" + "InnoDB: tables and recreate the whole InnoDB tablespace.\n" + "InnoDB: If the mysqld server crashes after the startup or when\n" + "InnoDB: you dump the tables, look at\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/Forcing_recovery.html" + " for help.\n", stderr); + + } else { + fprintf(stderr, "InnoDB: unknown error code %lu\n", + (ulong) err); + ut_error; + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(FALSE); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +/************************************************************************ +Create a prebuilt struct for a MySQL table handle. */ + +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + /* out, own: a prebuilt struct */ + dict_table_t* table) /* in: Innobase table handle */ +{ + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dtuple_t* ref; + ulint ref_len; + ulint i; + + heap = mem_heap_create(128); + + prebuilt = mem_heap_alloc(heap, sizeof(row_prebuilt_t)); + + prebuilt->magic_n = ROW_PREBUILT_ALLOCATED; + prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED; + + prebuilt->table = table; + + prebuilt->trx = NULL; + + prebuilt->sql_stat_start = TRUE; + + prebuilt->mysql_has_locked = FALSE; + + prebuilt->index = NULL; + + prebuilt->used_in_HANDLER = FALSE; + + prebuilt->n_template = 0; + prebuilt->mysql_template = NULL; + + prebuilt->heap = heap; + prebuilt->ins_node = NULL; + + prebuilt->ins_upd_rec_buff = NULL; + + prebuilt->upd_node = NULL; + prebuilt->ins_graph = NULL; + prebuilt->upd_graph = NULL; + + prebuilt->pcur = btr_pcur_create_for_mysql(); + prebuilt->clust_pcur = btr_pcur_create_for_mysql(); + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = 99999999; + + prebuilt->sel_graph = NULL; + + prebuilt->search_tuple = dtuple_create(heap, + 2 * dict_table_get_n_cols(table)); + + clust_index = dict_table_get_first_index(table); + + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + prebuilt->fetch_cache[i] = NULL; + } + + prebuilt->n_fetch_cached = 0; + + prebuilt->blob_heap = NULL; + + prebuilt->old_vers_heap = NULL; + + return(prebuilt); +} + +/************************************************************************ +Free a prebuilt struct for a MySQL table handle. */ + +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt) /* in, own: prebuilt struct */ +{ + ulint i; + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED + || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, +"InnoDB: Error: trying to free a corrupt\n" +"InnoDB: table handle. Magic n %lu, magic n2 %lu, table name", + (ulong) prebuilt->magic_n, + (ulong) prebuilt->magic_n2); + ut_print_name(stderr, NULL, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption((byte*)prebuilt); + + ut_error; + } + + prebuilt->magic_n = ROW_PREBUILT_FREED; + prebuilt->magic_n2 = ROW_PREBUILT_FREED; + + btr_pcur_free_for_mysql(prebuilt->pcur); + btr_pcur_free_for_mysql(prebuilt->clust_pcur); + + if (prebuilt->mysql_template) { + mem_free(prebuilt->mysql_template); + } + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + mem_heap_free(prebuilt->blob_heap); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + if (prebuilt->fetch_cache[i] != NULL) { + + if ((ROW_PREBUILT_FETCH_MAGIC_N != + mach_read_from_4((prebuilt->fetch_cache[i]) - 4)) + || (ROW_PREBUILT_FETCH_MAGIC_N != + mach_read_from_4((prebuilt->fetch_cache[i]) + + prebuilt->mysql_row_len))) { + fputs( + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: fetch buffer.\n", stderr); + + mem_analyze_corruption( + prebuilt->fetch_cache[i]); + + ut_error; + } + + mem_free((prebuilt->fetch_cache[i]) - 4); + } + } + + dict_table_decrement_handle_count(prebuilt->table); + + mem_heap_free(prebuilt->heap); +} + +/************************************************************************* +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ + +void +row_update_prebuilt_trx( +/*====================*/ + /* out: prebuilt dtuple */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + trx_t* trx) /* in: transaction handle */ +{ + if (trx->magic_n != TRX_MAGIC_N) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: trx handle. Magic n %lu\n", + (ulong) trx->magic_n); + + mem_analyze_corruption((byte*)trx); + + ut_error; + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, NULL, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption((byte*)prebuilt); + + ut_error; + } + + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/************************************************************************* +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. */ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + /* out: prebuilt dtuple; the column + type information is also set in it */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + ins_node_t* node; + dtuple_t* row; + dict_table_t* table = prebuilt->table; + ulint i; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node == NULL) { + + /* Not called before for this handle: create an insert node + and query graph to the prebuilt struct */ + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == NULL) { + prebuilt->ins_upd_rec_buff = mem_heap_alloc( + prebuilt->heap, + prebuilt->mysql_row_len); + } + + row = dtuple_create(prebuilt->heap, + dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + /* We init the value of every field to the SQL NULL to avoid + a debug assertion from failing */ + + for (i = 0; i < dtuple_get_n_fields(row); i++) { + + dtuple_get_nth_field(row, i)->len = UNIV_SQL_NULL; + } + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = + que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->ins_node->row); +} + +/************************************************************************* +Updates the table modification counter and calculates new estimates +for table and index statistics if necessary. */ +UNIV_INLINE +void +row_update_statistics_if_needed( +/*============================*/ + dict_table_t* table) /* in: table */ +{ + ulint counter; + + counter = table->stat_modified_counter; + + table->stat_modified_counter = counter + 1; + + /* Calculate new statistics if 1 / 16 of table has been modified + since the last time a statistics batch was run, or if + stat_modified_counter > 2 000 000 000 (to avoid wrap-around). + We calculate statistics at most every 16th round, since we may have + a counter table which is very small and updated very often. */ + + if (counter > 2000000000 + || ((ib_longlong)counter > 16 + table->stat_n_rows / 16)) { + + dict_update_statistics(table); + } +} + +/************************************************************************* +Unlocks an AUTO_INC type lock possibly reserved by trx. */ + +void +row_unlock_table_autoinc_for_mysql( +/*===============================*/ + trx_t* trx) /* in: transaction */ +{ + if (!trx->auto_inc_lock) { + + return; + } + + lock_table_unlock_auto_inc(trx); +} + +/************************************************************************* +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. */ + +int +row_lock_table_autoinc_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in the MySQL + table handle */ +{ + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (trx->auto_inc_lock) { + + return(DB_SUCCESS); + } + + trx->op_info = "setting auto-inc lock"; + + if (node == NULL) { + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + } + + /* We use the insert query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(trx); + + err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Unlocks all table locks explicitly requested by trx (with LOCK TABLES, +lock type LOCK_TABLE_EXP). */ + +void +row_unlock_tables_for_mysql( +/*========================*/ + trx_t* trx) /* in: transaction */ +{ + if (!trx->n_lock_table_exp) { + + return; + } + + mutex_enter(&kernel_mutex); + lock_release_tables_off_kernel(trx); + mutex_exit(&kernel_mutex); +} + +/************************************************************************* +Sets a table lock on the table mentioned in prebuilt. */ + +int +row_lock_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in the MySQL + table handle */ + dict_table_t* table, /* in: table to lock, or NULL + if prebuilt->table should be + locked or a + prebuilt->select_lock_type */ + ulint mode) /* in: lock mode of table */ +{ + trx_t* trx = prebuilt->trx; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "setting table lock"; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(trx); + + if (table) { + err = lock_table(0, table, mode, thr); + } else { + if (mode == LOCK_TABLE_TRANSACTIONAL) { + err = lock_table(LOCK_TABLE_TRANSACTIONAL, + prebuilt->table, + prebuilt->select_lock_type, thr); + } else { + err = lock_table(LOCK_TABLE_EXP, prebuilt->table, + prebuilt->select_lock_type, thr); + } + } + + trx->error_state = err; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Does an insert for MySQL. */ + +int +row_insert_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" +"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n" +"InnoDB: table %s does not exist.\n" +"InnoDB: Have you deleted the .ibd file from the database directory under\n" +"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n" +"InnoDB: Look from\n" +"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" +"InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, prebuilt->trx, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption((byte*)prebuilt); + + ut_error; + } + + if (srv_created_new_raw || srv_force_recovery) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "inserting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started(trx); + + if (node == NULL) { + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + } + + row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec); + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + } + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + +/* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW; + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + prebuilt->table->stat_n_rows++; + + srv_n_rows_inserted++; + + if (prebuilt->table->stat_n_rows == 0) { + /* Avoid wrap-over */ + prebuilt->table->stat_n_rows--; + } + + row_update_statistics_if_needed(prebuilt->table); + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Builds a dummy query graph used in selects. */ + +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + sel_node_t* node; + + ut_ad(prebuilt && prebuilt->trx); + + if (prebuilt->sel_graph == NULL) { + + node = sel_node_create(prebuilt->heap); + + prebuilt->sel_graph = + que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + + prebuilt->sel_graph->state = QUE_FORK_ACTIVE; + } +} + +/************************************************************************* +Creates an query graph node of 'update' type to be used in the MySQL +interface. */ + +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + /* out, own: update node */ + dict_table_t* table, /* in: table to update */ + mem_heap_t* heap) /* in: mem heap from which allocated */ +{ + upd_node_t* node; + + node = upd_node_create(heap); + + node->in_mysql_interface = TRUE; + node->is_delete = FALSE; + node->searched_update = FALSE; + node->select_will_do_update = FALSE; + node->select = NULL; + node->pcur = btr_pcur_create_for_mysql(); + node->table = table; + + node->update = upd_create(dict_table_get_n_cols(table), heap); + + node->update_n_fields = dict_table_get_n_cols(table); + + UT_LIST_INIT(node->columns); + node->has_clust_rec_x_lock = TRUE; + node->cmpl_info = 0; + + node->table_sym = NULL; + node->col_assign_list = NULL; + + return(node); +} + +/************************************************************************* +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. */ + +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + /* out: prebuilt update vector */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + upd_node_t* node; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->upd_node == NULL) { + + /* Not called before for this handle: create an update node + and query graph to the prebuilt struct */ + + node = row_create_update_node_for_mysql(table, prebuilt->heap); + + prebuilt->upd_node = node; + + prebuilt->upd_graph = + que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->upd_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->upd_node->update); +} + +/************************************************************************* +Does an update or delete of a row for MySQL. */ + +int +row_update_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + ulint err; + que_thr_t* thr; + ibool was_lock_wait; + dict_index_t* clust_index; +/* ulint ref_len; */ + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + UT_NOT_USED(mysql_rec); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" +"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n" +"InnoDB: table %s does not exist.\n" +"InnoDB: Have you deleted the .ibd file from the database directory under\n" +"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n" +"InnoDB: Look from\n" +"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" +"InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, prebuilt->trx, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption((byte*)prebuilt); + + ut_error; + } + + if (srv_created_new_raw || srv_force_recovery) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "updating or deleting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started(trx); + + node = prebuilt->upd_node; + + clust_index = dict_table_get_first_index(table); + + if (prebuilt->pcur->btr_cur.index == clust_index) { + btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur); + } else { + btr_pcur_copy_stored_position(node->pcur, + prebuilt->clust_pcur); + } + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + /* MySQL seems to call rnd_pos before updating each row it + has cached: we can get the correct cursor position from + prebuilt->pcur; NOTE that we cannot build the row reference + from mysql_rec if the clustered index was automatically + generated for the table: MySQL does not know anything about + the row id used as the clustered index key */ + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + + return((int) err); + } + + thr->lock_state= QUE_THR_LOCK_ROW; + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK;; + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (node->is_delete) { + if (prebuilt->table->stat_n_rows > 0) { + prebuilt->table->stat_n_rows--; + } + + srv_n_rows_deleted++; + } else { + srv_n_rows_updated++; + } + + row_update_statistics_if_needed(prebuilt->table); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Does an unlock of a row for MySQL. */ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + rec_t* rec; + btr_pcur_t* cur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + mtr_t mtr; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "unlock_row"; + + if (srv_locks_unsafe_for_binlog) { + if (trx->trx_create_lock == TRUE) { + + mtr_start(&mtr); + + /* Restore a cursor position and find a record */ + btr_pcur_restore_position(BTR_SEARCH_LEAF, cur, &mtr); + rec = btr_pcur_get_rec(cur); + + if (rec) { + + lock_rec_reset_and_release_wait(rec); + } else { + fputs("InnoDB: Error: " + "Record for the lock not found\n", + stderr); + mem_analyze_corruption((byte*) trx); + ut_error; + } + + trx->trx_create_lock = FALSE; + mtr_commit(&mtr); + } + + } + + trx->op_info = ""; + + return(DB_SUCCESS); +} + +/************************************************************************** +Does a cascaded delete or set null in a foreign key operation. */ + +ulint +row_update_cascade_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + que_thr_t* thr, /* in: query thread */ + upd_node_t* node, /* in: update node used in the cascade + or set null operation */ + dict_table_t* table) /* in: table where we do the operation */ +{ + ulint err; + trx_t* trx; + + trx = thr_get_trx(thr); +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + /* Note that the cascade node is a subnode of another InnoDB + query graph node. We do a normal lock wait in this node, but + all errors are handled by the parent node. */ + + if (err == DB_LOCK_WAIT) { + /* Handle lock wait here */ + + que_thr_stop_for_mysql(thr); + + srv_suspend_mysql_thread(thr); + + /* Note that a lock wait may also end in a lock wait timeout, + or this transaction is picked as a victim in selective + deadlock resolution */ + + if (trx->error_state != DB_SUCCESS) { + + return(trx->error_state); + } + + /* Retry operation after a normal lock wait */ + + goto run_again; + } + + if (err != DB_SUCCESS) { + + return(err); + } + + if (node->is_delete) { + if (table->stat_n_rows > 0) { + table->stat_n_rows--; + } + + srv_n_rows_deleted++; + } else { + srv_n_rows_updated++; + } + + row_update_statistics_if_needed(table); + + return(err); +} + +/************************************************************************* +Checks if a table is such that we automatically created a clustered +index on it (on row id). */ + +ibool +row_table_got_default_clust_index( +/*==============================*/ + dict_table_t* table) +{ + dict_index_t* clust_index; + + clust_index = dict_table_get_first_index(table); + + if (dtype_get_mtype(dict_index_get_nth_type(clust_index, 0)) + == DATA_SYS) { + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Calculates the key number used inside MySQL for an Innobase index. We have +to take into account if we generated a default clustered index for the table */ + +ulint +row_get_mysql_key_number_for_index( +/*===============================*/ + dict_index_t* index) +{ + dict_index_t* ind; + ulint i; + + ut_a(index); + + i = 0; + ind = dict_table_get_first_index(index->table); + + while (index != ind) { + ind = dict_table_get_next_index(ind); + i++; + } + + if (row_table_got_default_clust_index(index->table)) { + ut_a(i > 0); + i--; + } + + return(i); +} + +/************************************************************************* +Recovers an orphaned tmp table inside InnoDB by renaming it. In the table +name #sql becomes rsql, and "_recover_innodb_tmp_table" is catenated to +the end of name. table->name should be of the form +"dbname/rsql..._recover_innodb_tmp_table". This renames a table whose +name is "#sql..." */ +static +int +row_mysql_recover_tmp_table( +/*========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table definition */ + trx_t* trx) /* in: transaction handle */ +{ + const char* ptr = strstr(table->name, "/rsql"); + + if (!ptr) { + /* table name does not begin with "/rsql" */ + trx_commit_for_mysql(trx); + return(DB_ERROR); + } + else { + int status; + int namelen = (int) strlen(table->name); + char* old_name = mem_strdupl(table->name, namelen); + /* replace "rsql" with "#sql" */ + old_name[ptr - table->name + 1] = '#'; + /* remove "_recover_innodb_tmp_table" suffix */ + ut_ad(namelen > (int) sizeof S_recover_innodb_tmp_table); + ut_ad(!strcmp(old_name + namelen + 1 - + sizeof S_recover_innodb_tmp_table, + S_recover_innodb_tmp_table)); + old_name[namelen + 1 - sizeof S_recover_innodb_tmp_table] = 0; + status = row_rename_table_for_mysql(old_name, + table->name, trx); + mem_free(old_name); + return(status); + } +} + +/************************************************************************* +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. */ + +void +row_mysql_freeze_data_dictionary( +/*=============================*/ + trx_t* trx) /* in: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == 0); + + rw_lock_s_lock(&dict_operation_lock); + + trx->dict_operation_lock_mode = RW_S_LATCH; +} + +/************************************************************************* +Unlocks the data dictionary shared lock. */ + +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx) /* in: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == RW_S_LATCH); + + rw_lock_s_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +/************************************************************************* +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ + +void +row_mysql_lock_data_dictionary( +/*===========================*/ + trx_t* trx) /* in: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == 0 + || trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks or lock waits can occur then in these operations */ + + rw_lock_x_lock(&dict_operation_lock); + trx->dict_operation_lock_mode = RW_X_LATCH; + + mutex_enter(&(dict_sys->mutex)); +} + +/************************************************************************* +Unlocks the data dictionary exclusive lock. */ + +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx) /* in: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_exit(&(dict_sys->mutex)); + rw_lock_x_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +/************************************************************************* +Does a table creation operation for MySQL. If the name of the table +to be created is equal with one of the predefined magic table names, +then this also starts printing the corresponding monitor output by +the master thread. */ + +int +row_create_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table definition */ + trx_t* trx) /* in: transaction handle */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + const char* table_name; + ulint table_name_len; + ulint err; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + + if (srv_created_new_raw) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + trx_commit_for_mysql(trx); + + return(DB_ERROR); + } + + trx->op_info = "creating table"; + + if (row_mysql_is_system_table(table->name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL system table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be of the MyISAM type!\n", + table->name); + + trx_commit_for_mysql(trx); + + return(DB_ERROR); + } + + trx_start_if_not_started(trx); + + if (row_mysql_is_recovered_tmp_table(table->name)) { + + /* MySQL prevents accessing of tables whose name begins + with #sql, that is temporary tables. If mysqld crashes in + the middle of an ALTER TABLE, we may get an orphaned + #sql-table in the tablespace. We have here a special + mechanism to recover such tables by renaming them to + rsql... */ + + return(row_mysql_recover_tmp_table(table, trx)); + } + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. */ + table_name = strchr(table->name, '/'); + ut_a(table_name); + table_name++; + table_name_len = strlen(table_name) + 1; + + if (table_name_len == sizeof S_innodb_monitor + && !memcmp(table_name, S_innodb_monitor, + sizeof S_innodb_monitor)) { + + /* Table equals "innodb_monitor": + start monitor prints */ + + srv_print_innodb_monitor = TRUE; + + /* The lock timeout monitor thread also takes care + of InnoDB monitor prints */ + + os_event_set(srv_lock_timeout_thread_event); + } else if (table_name_len == sizeof S_innodb_lock_monitor + && !memcmp(table_name, S_innodb_lock_monitor, + sizeof S_innodb_lock_monitor)) { + + srv_print_innodb_monitor = TRUE; + srv_print_innodb_lock_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (table_name_len == sizeof S_innodb_tablespace_monitor + && !memcmp(table_name, S_innodb_tablespace_monitor, + sizeof S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (table_name_len == sizeof S_innodb_table_monitor + && !memcmp(table_name, S_innodb_table_monitor, + sizeof S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (table_name_len == sizeof S_innodb_mem_validate + && !memcmp(table_name, S_innodb_mem_validate, + sizeof S_innodb_mem_validate)) { + /* We define here a debugging feature intended for + developers */ + + fputs("Validating InnoDB memory:\n" + "to use this feature you must compile InnoDB with\n" + "UNIV_MEM_DEBUG defined in univ.i and the server must be\n" + "quiet because allocation from a mem heap is not protected\n" + "by any semaphore.\n", stderr); +#ifdef UNIV_MEM_DEBUG + ut_a(mem_validate()); + fputs("Memory validated\n", stderr); +#else /* UNIV_MEM_DEBUG */ + fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n", + stderr); +#endif /* UNIV_MEM_DEBUG */ + } + + heap = mem_heap_create(512); + + trx->dict_operation = TRUE; + + node = tab_create_graph_create(table, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + if (err == DB_OUT_OF_FILE_SPACE) { + fputs("InnoDB: Warning: cannot create table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs(" because tablespace full\n", stderr); + row_drop_table_for_mysql(table->name, trx, FALSE); + + } else if (err == DB_DUPLICATE_KEY) { + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs(" already exists in InnoDB internal\n" + "InnoDB: data dictionary. Have you deleted the .frm file\n" + "InnoDB: and not used DROP TABLE? Have you used DROP DATABASE\n" + "InnoDB: for InnoDB tables in MySQL version <= 3.23.43?\n" + "InnoDB: See the Restrictions section of the InnoDB manual.\n" + "InnoDB: You can drop the orphaned table inside InnoDB by\n" + "InnoDB: creating an InnoDB table with the same name in another\n" + "InnoDB: database and copying the .frm file to the current database.\n" + "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n" + "InnoDB: succeed.\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n", stderr); + } + + /* We may also get err == DB_ERROR if the .ibd file for the + table already exists */ + + trx->error_state = DB_SUCCESS; + } + + que_graph_free((que_t*) que_node_get_parent(thr)); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. */ + +int +row_create_index_for_mysql( +/*=======================*/ + /* out: error number or DB_SUCCESS */ + dict_index_t* index, /* in: index definition */ + trx_t* trx) /* in: transaction handle */ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + ulint i, j; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "creating index"; + + trx_start_if_not_started(trx); + + /* Check that the same column does not appear twice in the index. + Starting from 4.0.14, InnoDB should be able to cope with that, but + safer not to allow them. */ + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + for (j = 0; j < i; j++) { + if (0 == ut_strcmp( + dict_index_get_nth_field(index, j)->name, + dict_index_get_nth_field(index, i)->name)) { + + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: column ", stderr); + ut_print_name(stderr, trx, + dict_index_get_nth_field(index, i)->name); + fputs(" appears twice in ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: This is not allowed in InnoDB.\n", + stderr); + + err = DB_COL_APPEARS_TWICE_IN_INDEX; + + goto error_handling; + } + } + + /* Check also that prefix_len < DICT_MAX_COL_PREFIX_LEN */ + + if (dict_index_get_nth_field(index, i)->prefix_len + >= DICT_MAX_COL_PREFIX_LEN) { + err = DB_TOO_BIG_RECORD; + + goto error_handling; + } + } + + if (row_mysql_is_recovered_tmp_table(index->table_name)) { + + return(DB_SUCCESS); + } + + heap = mem_heap_create(512); + + trx->dict_operation = TRUE; + + /* Note that the space id where we store the index is inherited from + the table in dict_build_index_def_step() in dict0crea.c. */ + + node = ind_create_graph_create(index, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + err = trx->error_state; + + que_graph_free((que_t*) que_node_get_parent(thr)); + +error_handling: + if (err != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + row_drop_table_for_mysql(index->table_name, trx, FALSE); + + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied with indexes in +bot participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. Check also that foreign key +constraints which reference this table are ok. */ + +int +row_table_add_foreign_constraints( +/*==============================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the + database name before it: test.table2 */ + const char* name) /* in: table full name in the + normalized form + database_name/table_name */ +{ + ulint err; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_a(sql_string); + + trx->op_info = "adding foreign keys"; + + trx_start_if_not_started(trx); + + if (row_mysql_is_recovered_tmp_table(name)) { + + return(DB_SUCCESS); + } + + trx->dict_operation = TRUE; + + err = dict_create_foreign_constraints(trx, sql_string, name); + + if (err == DB_SUCCESS) { + /* Check that also referencing constraints are ok */ + err = dict_load_foreigns(name, trx->check_foreigns); + } + + if (err != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + row_drop_table_for_mysql(name, trx, FALSE); + + trx->error_state = DB_SUCCESS; + } + + return((int) err); +} + +/************************************************************************* +Drops a table for MySQL as a background operation. MySQL relies on Unix +in ALTER TABLE to the fact that the table handler does not remove the +table before all handles to it has been removed. Furhermore, the MySQL's +call to drop table must be non-blocking. Therefore we do the drop table +as a background operation, which is taken care of by the master thread +in srv0srv.c. */ +static +int +row_drop_table_for_mysql_in_background( +/*===================================*/ + /* out: error code or DB_SUCCESS */ + const char* name) /* in: table name */ +{ + ulint error; + trx_t* trx; + + trx = trx_allocate_for_background(); + + /* If the original transaction was dropping a table referenced by + foreign keys, we must set the following to be able to drop the + table: */ + + trx->check_foreigns = FALSE; + +/* fputs("InnoDB: Error: Dropping table ", stderr); + ut_print_name(stderr, name); + fputs(" in background drop list\n", stderr); */ + + /* Try to drop the table in InnoDB */ + + error = row_drop_table_for_mysql(name, trx, FALSE); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + trx_commit_for_mysql(trx); + + trx_free_for_background(trx); + + return((int) error); +} + +/************************************************************************* +The master thread in srv0srv.c calls this regularly to drop tables which +we must drop in background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. */ + +ulint +row_drop_tables_for_mysql_in_background(void) +/*=========================================*/ + /* out: how many tables dropped + + remaining tables in list */ +{ + row_mysql_drop_t* drop; + dict_table_t* table; + ulint n_tables; + ulint n_tables_dropped = 0; +loop: + mutex_enter(&kernel_mutex); + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + + n_tables = UT_LIST_GET_LEN(row_mysql_drop_list); + + mutex_exit(&kernel_mutex); + + if (drop == NULL) { + /* All tables dropped */ + + return(n_tables + n_tables_dropped); + } + + mutex_enter(&(dict_sys->mutex)); + table = dict_table_get_low(drop->table_name); + mutex_exit(&(dict_sys->mutex)); + + if (table == NULL) { + /* If for some reason the table has already been dropped + through some other mechanism, do not try to drop it */ + + goto already_dropped; + } + + if (DB_SUCCESS != row_drop_table_for_mysql_in_background( + drop->table_name)) { + /* If the DROP fails for some table, we return, and let the + main thread retry later */ + + return(n_tables + n_tables_dropped); + } + + n_tables_dropped++; + +already_dropped: + mutex_enter(&kernel_mutex); + + UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Dropped table %s in background drop queue.\n", + drop->table_name); + + mem_free(drop->table_name); + + mem_free(drop); + + mutex_exit(&kernel_mutex); + + goto loop; +} + +/************************************************************************* +Get the background drop list length. NOTE: the caller must own the kernel +mutex! */ + +ulint +row_get_background_drop_list_len_low(void) +/*======================================*/ + /* out: how many tables in list */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + return(UT_LIST_GET_LEN(row_mysql_drop_list)); +} + +/************************************************************************* +If a table is not yet in the drop list, adds the table to the list of tables +which the master thread drops in background. We need this on Unix because in +ALTER TABLE MySQL may call drop table even if the table has running queries on +it. Also, if there are running foreign key checks on the table, we drop the +table lazily. */ +static +ibool +row_add_table_to_background_drop_list( +/*==================================*/ + /* out: TRUE if the table was not yet in the + drop list, and was added there */ + dict_table_t* table) /* in: table */ +{ + row_mysql_drop_t* drop; + + mutex_enter(&kernel_mutex); + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + /* Look if the table already is in the drop list */ + drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + + while (drop != NULL) { + if (strcmp(drop->table_name, table->name) == 0) { + /* Already in the list */ + + mutex_exit(&kernel_mutex); + + return(FALSE); + } + + drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop); + } + + drop = mem_alloc(sizeof(row_mysql_drop_t)); + + drop->table_name = mem_strdup(table->name); + + UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop); + +/* fputs("InnoDB: Adding table ", stderr); + ut_print_name(stderr, drop->table_name); + fputs(" to background drop list\n", stderr); */ + + mutex_exit(&kernel_mutex); + + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************* +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set TRUE. */ + +int +row_discard_tablespace_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_foreign_t* foreign; + dulint new_id; + dict_table_t* table; + que_thr_t* thr; + que_t* graph = NULL; + ibool success; + ulint err; + char* buf; + +/* How do we prevent crashes caused by ongoing operations on the table? Old +operations could try to access non-existent pages. + +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do DISCARD TABLESPACE. Then there are no running +queries on the table. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: we remove all entries for the tablespace in the insert +buffer tree; as long as the tablespace mem object does not exist, ongoing +insert buffer page merges are discarded in buf0rea.c. If we recreate the +tablespace mem object with IMPORT TABLESPACE later, then the tablespace will +have the same id, but the tablespace_version field in the mem object is +different, and ongoing old insert buffer page merges get discarded. +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations. +5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we +do not allow the discard. We also reserve the data dictionary latch. */ + + static const char discard_tablespace_proc1[] = + "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "new_id CHAR;\n" + "new_id_low INT;\n" + "new_id_high INT;\n" + "table_name CHAR;\n" + "BEGIN\n" + "table_name := '"; + static const char discard_tablespace_proc2[] = + "';\n" + "new_id_high := %lu;\n" + "new_id_low := %lu;\n" + "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n" + "SELECT ID INTO old_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = table_name;\n" + "IF (SQL %% NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "UPDATE SYS_TABLES SET ID = new_id\n" + "WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "discarding tablespace"; + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, name); + fputs("\n" +"InnoDB: is in the system tablespace 0 which cannot be discarded\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + if (table->n_foreign_key_checks_running > 0) { + + ut_print_timestamp(stderr); + fputs(" InnoDB: You are trying to DISCARD table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" + "InnoDB: though there is a foreign key check running on it.\n" + "InnoDB: Cannot discard the table.\n", + stderr); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + + FILE* ef = dict_foreign_err_file; + + /* We only allow discarding a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + err = DB_CANNOT_DROP_CONSTRAINT; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot DISCARD table ", ef); + ut_print_name(ef, trx, name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + goto funct_exit; + } + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + buf = mem_alloc((sizeof discard_tablespace_proc1) + + (sizeof discard_tablespace_proc2) + + 20 + ut_strlenq(name, '\'')); + + memcpy(buf, discard_tablespace_proc1, sizeof discard_tablespace_proc1); + sprintf(ut_strcpyq(buf + (sizeof discard_tablespace_proc1 - 1), + '\'', name), + discard_tablespace_proc2, + (ulong) ut_dulint_get_high(new_id), + (ulong) ut_dulint_get_low(new_id)); + + graph = pars_sql(buf); + + ut_a(graph); + + /* Remove any locks there are on the table or its records */ + + lock_reset_all_on_table(table); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + dict_table_change_id_in_cache(table, new_id); + + success = fil_discard_tablespace(table->space); + + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + + err = DB_ERROR; + } else { + /* Set the flag which tells that now it is legal to + IMPORT a tablespace for this table */ + table->tablespace_discarded = TRUE; + table->ibd_file_missing = TRUE; + } + } +funct_exit: + row_mysql_unlock_data_dictionary(trx); + + if (graph) { + que_graph_free(graph); + } + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + + return((int) err); +} + +/********************************************************************* +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. */ + +int +row_import_tablespace_for_mysql( +/*============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + ibool success; + dulint current_lsn; + ulint err = DB_SUCCESS; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx_start_if_not_started(trx); + + trx->op_info = "importing tablespace"; + + current_lsn = log_get_lsn(); + + /* It is possible, though very improbable, that the lsn's in the + tablespace to be imported have risen above the current system lsn, if + a lengthy purge, ibuf merge, or rollback was performed on a backup + taken with ibbackup. If that is the case, reset page lsn's in the + file. We assume that mysqld was shut down after it performed these + cleanup operations on the .ibd file, so that it stamped the latest lsn + to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file. + + TODO: reset also the trx id's in clustered index records and write + a new space id to each data page. That would allow us to import clean + .ibd files from another MySQL installation. */ + + success = fil_reset_too_high_lsns(name, current_lsn); + + if (!success) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr); + ut_print_name(stderr, trx, name); + fputs("\n" + "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", stderr); + + err = DB_ERROR; + + row_mysql_lock_data_dictionary(trx); + + goto funct_exit; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + ut_print_timestamp(stderr); + fputs(" InnoDB: table ", stderr); + ut_print_name(stderr, trx, name); + fputs("\n" +"InnoDB: does not exist in the InnoDB data dictionary\n" +"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", + stderr); + + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, name); + fputs("\n" +"InnoDB: is in the system tablespace 0 which cannot be imported\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + if (!table->tablespace_discarded) { + ut_print_timestamp(stderr); + fputs( +" InnoDB: Error: you are trying to IMPORT a tablespace\n" +"InnoDB: ", stderr); + ut_print_name(stderr, trx, name); + fputs(", though you have not called DISCARD on it yet\n" +"InnoDB: during the lifetime of the mysqld process!\n", stderr); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + success = fil_open_single_table_tablespace(TRUE, table->space, + table->name); + if (success) { + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; + } else { + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fputs( +" InnoDB: cannot find or open in the database directory the .ibd file of\n" +"InnoDB: table ", stderr); + ut_print_name(stderr, trx, name); + fputs("\n" +"InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", + stderr); + } + + err = DB_ERROR; + } + +funct_exit: + row_mysql_unlock_data_dictionary(trx); + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Truncates a table for MySQL. */ + +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx) /* in: transaction handle */ +{ + dict_foreign_t* foreign; + ulint err; + mem_heap_t* heap; + byte* buf; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_index; + btr_pcur_t pcur; + mtr_t mtr; + dulint new_id; + char* sql; + que_thr_t* thr; + que_t* graph = NULL; + +/* How do we prevent crashes caused by ongoing operations on the table? Old +operations could try to access non-existent pages. + +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do TRUNCATE TABLE. Then there are no running +queries on the table. This is guaranteed, because in +ha_innobase::store_lock(), we do not weaken the TL_WRITE lock requested +by MySQL when executing SQLCOM_TRUNCATE. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, so we do not +have to remove insert buffer records, as the insert buffer works at a low +level. If a freed page is later reallocated, the allocator will remove +the ibuf entries for it. + +TODO: when we truncate *.ibd files (analogous to DISCARD TABLESPACE), we +will have to remove we remove all entries for the table in the insert +buffer tree! + +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations. (This will only be relevant for TRUNCATE TABLE +by DISCARD TABLESPACE.) +5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we +do not allow the TRUNCATE. We also reserve the data dictionary latch. */ + + static const char renumber_tablespace_proc[] = + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "new_id CHAR;\n" + "old_id_low INT;\n" + "old_id_high INT;\n" + "new_id_low INT;\n" + "new_id_high INT;\n" + "BEGIN\n" + "old_id_high := %lu;\n" + "old_id_low := %lu;\n" + "new_id_high := %lu;\n" + "new_id_low := %lu;\n" + "old_id := CONCAT(TO_BINARY(old_id_high, 4), TO_BINARY(old_id_low, 4));\n" + "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n" + "UPDATE SYS_TABLES SET ID = new_id\n" + "WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(table); + + if (srv_created_new_raw) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "truncating table"; + + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + ut_a(trx->dict_operation_lock_mode == 0); + /* Prevent foreign key checks etc. while we are truncating the + table */ + + row_mysql_lock_data_dictionary(trx); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; + + /* We only allow truncating a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot truncate table ", ef); + ut_print_name(ef, trx, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_ERROR; + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been truncated here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot truncate table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs(" by DROP+CREATE\n" +"InnoDB: because there is a foreign key check running on it.\n", + stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* Remove any locks there are on the table or its records */ + + lock_reset_all_on_table(table); + + trx->table_id = table->id; + + /* scan SYS_INDEXES for all indexes of the table */ + heap = mem_heap_create(800); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + sys_index = dict_table_get_first_index(dict_sys->sys_indexes); + dict_index_copy_types(tuple, sys_index, 1); + + mtr_start(&mtr); + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + for (;;) { + rec_t* rec; + const byte* field; + ulint len; + ulint root_page_no; + + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + /* The end of SYS_INDEXES has been reached. */ + break; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + if (memcmp(buf, field, len) != 0) { + /* End of indexes for the table (TABLE_ID mismatch). */ + break; + } + + if (rec_get_deleted_flag(rec, FALSE)) { + /* The index has been dropped. */ + goto next_rec; + } + + btr_pcur_store_position(&pcur, &mtr); + + /* This call may commit and restart mtr. */ + root_page_no = dict_truncate_index_tree(table, rec, &mtr); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (root_page_no != FIL_NULL) { + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, + root_page_no, &mtr); + /* We will need to commit and restart the + mini-transaction in order to avoid deadlocks. + The dict_truncate_index_tree() call has allocated + a page in this mini-transaction, and the rest of + this loop could latch another index page. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, + &pcur, &mtr); + } + + next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + mem_heap_empty(heap); + sql = mem_heap_alloc(heap, (sizeof renumber_tablespace_proc) + 40); + sprintf(sql, renumber_tablespace_proc, + (ulong) ut_dulint_get_high(table->id), + (ulong) ut_dulint_get_low(table->id), + (ulong) ut_dulint_get_high(new_id), + (ulong) ut_dulint_get_low(new_id)); + + graph = pars_sql(sql); + + ut_a(graph); + + mem_heap_free(heap); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + thr = que_fork_start_command(graph); + ut_a(thr); + + que_run_threads(thr); + + que_graph_free(graph); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); +fputs(" InnoDB: Unable to assign a new identifier to table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: after truncating it. Background processes may corrupt the table!\n", + stderr); + err = DB_ERROR; + } else { + dict_table_change_id_in_cache(table, new_id); + } + + dict_table_autoinc_initialize(table, 0); + dict_update_statistics(table); + + trx_commit_for_mysql(trx); + +funct_exit: + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + srv_wake_master_thread(); + + return((int) err); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************* +Drops a table for MySQL. If the name of the table to be dropped is equal +with one of the predefined magic table names, then this also stops printing +the corresponding monitor output by the master thread. */ + +int +row_drop_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx, /* in: transaction handle */ + ibool drop_db)/* in: TRUE=dropping whole database */ +{ + dict_foreign_t* foreign; + dict_table_t* table; + ulint space_id; + que_thr_t* thr; + que_t* graph; + ulint err; + const char* table_name; + ulint namelen; + char* dir_path_of_temp_table = NULL; + ibool success; + ibool locked_dictionary = FALSE; + char* quoted_name; + char* sql; + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + + static const char str1[] = + "PROCEDURE DROP_TABLE_PROC () IS\n" + "table_name CHAR;\n" + "sys_foreign_id CHAR;\n" + "table_id CHAR;\n" + "index_id CHAR;\n" + "foreign_id CHAR;\n" + "found INT;\n" + "BEGIN\n" + "table_name := "; + static const char str2[] = + ";\n" + "SELECT ID INTO table_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = table_name;\n" + "IF (SQL % NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "found := 1;\n" + "SELECT ID INTO sys_foreign_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = 'SYS_FOREIGN';\n" + "IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (table_name = 'SYS_FOREIGN') THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (table_name = 'SYS_FOREIGN_COLS') THEN\n" + " found := 0;\n" + "END IF;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = table_name\n" + " AND TO_BINARY(FOR_NAME) = TO_BINARY(table_name);\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE" + " DELETE FROM SYS_FOREIGN_COLS WHERE ID = foreign_id;\n" + " DELETE FROM SYS_FOREIGN WHERE ID = foreign_id;\n" + " END IF;\n" + "END LOOP;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO index_id\n" + " FROM SYS_INDEXES\n" + " WHERE TABLE_ID = table_id;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n" + " DELETE FROM SYS_INDEXES WHERE ID = index_id\n" + " AND TABLE_ID = table_id;\n" + " END IF;\n" + "END LOOP;\n" + "DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n" + "DELETE FROM SYS_TABLES WHERE ID = table_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + ut_a(name != NULL); + + if (srv_created_new_raw) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "dropping table"; + + trx_start_if_not_started(trx); + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. */ + table_name = strchr(name, '/'); + ut_a(table_name); + table_name++; + namelen = strlen(table_name) + 1; + + if (namelen == sizeof S_innodb_monitor + && !memcmp(table_name, S_innodb_monitor, + sizeof S_innodb_monitor)) { + + /* Table name equals "innodb_monitor": + stop monitor prints */ + + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_lock_monitor + && !memcmp(table_name, S_innodb_lock_monitor, + sizeof S_innodb_lock_monitor)) { + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_tablespace_monitor + && !memcmp(table_name, S_innodb_tablespace_monitor, + sizeof S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = FALSE; + } else if (namelen == sizeof S_innodb_table_monitor + && !memcmp(table_name, S_innodb_table_monitor, + sizeof S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = FALSE; + } + + quoted_name = mem_strdupq(name, '\''); + namelen = strlen(quoted_name); + sql = mem_alloc((sizeof str1) + (sizeof str2) - 2 + 1 + namelen); + memcpy(sql, str1, (sizeof str1) - 1); + memcpy(sql + (sizeof str1) - 1, quoted_name, namelen); + memcpy(sql + (sizeof str1) - 1 + namelen, str2, sizeof str2); + mem_free(quoted_name); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + /* Prevent foreign key checks etc. while we are dropping the + table */ + + row_mysql_lock_data_dictionary(trx); + + locked_dictionary = TRUE; + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + graph = pars_sql(sql); + + ut_a(graph); + mem_free(sql); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is trying to drop it.\n" + "InnoDB: Have you copied the .frm file of the table to the\n" + "InnoDB: MySQL database directory from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n", stderr); + goto funct_exit; + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns && + !(drop_db && dict_tables_have_same_db( + name, foreign->foreign_table_name))) { + FILE* ef = dict_foreign_err_file; + + /* We only allow dropping a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + err = DB_CANNOT_DROP_CONSTRAINT; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot drop table ", ef); + ut_print_name(ef, trx, name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + goto funct_exit; + } + + if (table->n_mysql_handles_opened > 0) { + ibool added; + + added = row_add_table_to_background_drop_list(table); + + if (added) { + ut_print_timestamp(stderr); +fputs(" InnoDB: Warning: MySQL is trying to drop table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: though there are still open handles to it.\n" +"InnoDB: Adding the table to the background drop queue.\n", + stderr); + + /* We return DB_SUCCESS to MySQL though the drop will + happen lazily later */ + + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been dropped here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + + ibool added; + + added = row_add_table_to_background_drop_list(table); + + if (added) { + ut_print_timestamp(stderr); +fputs(" InnoDB: You are trying to drop table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: though there is a foreign key check running on it.\n" +"InnoDB: Adding the table to the background drop queue.\n", + stderr); + + /* We return DB_SUCCESS to MySQL though the drop will + happen lazily later */ + + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* Remove any locks there are on the table or its records */ + + lock_reset_all_on_table(table); + + trx->dict_operation = TRUE; + trx->table_id = table->id; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + ut_a(err == DB_OUT_OF_FILE_SPACE); + + err = DB_MUST_GET_MORE_FILE_SPACE; + + row_mysql_handle_errors(&err, trx, thr, NULL); + + ut_error; + } else { + ibool is_path; + const char* name_or_path; + + space_id = table->space; + + if (table->dir_path_of_temp_table != NULL) { + dir_path_of_temp_table = + mem_strdup(table->dir_path_of_temp_table); + is_path = TRUE; + name_or_path = dir_path_of_temp_table; + } else { + is_path = FALSE; + name_or_path = name; + } + + dict_table_remove_from_cache(table); + + if (dict_load_table(name) != NULL) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: not able to remove table ", + stderr); + ut_print_name(stderr, trx, name); + fputs(" from the dictionary cache!\n", stderr); + err = DB_ERROR; + } + + /* Do not drop possible .ibd tablespace if something went + wrong: we do not want to delete valuable data of the user */ + + if (err == DB_SUCCESS && space_id > 0) { + if (!fil_space_for_table_exists_in_mem(space_id, + name_or_path, + is_path, + FALSE, TRUE)) { + err = DB_SUCCESS; + + fprintf(stderr, +"InnoDB: We removed now the InnoDB internal data dictionary entry\n" +"InnoDB: of table "); + ut_print_name(stderr, trx, name); + fprintf(stderr, ".\n"); + + goto funct_exit; + } + + success = fil_delete_tablespace(space_id); + + if (!success) { + fprintf(stderr, +"InnoDB: We removed now the InnoDB internal data dictionary entry\n" +"InnoDB: of table "); + ut_print_name(stderr, trx, name); + fprintf(stderr, ".\n"); + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: not able to delete tablespace %lu of table ", + (ulong) space_id); + ut_print_name(stderr, trx, name); + fputs("!\n", stderr); + err = DB_ERROR; + } + } + } +funct_exit: + + if (locked_dictionary) { + row_mysql_unlock_data_dictionary(trx); + } + + if (dir_path_of_temp_table) { + mem_free(dir_path_of_temp_table); + } + + que_graph_free(graph); + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + +#ifndef UNIV_HOTBACKUP + srv_wake_master_thread(); +#endif /* !UNIV_HOTBACKUP */ + + return((int) err); +} + +/************************************************************************* +Drops a database for MySQL. */ + +int +row_drop_database_for_mysql( +/*========================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: database name which ends to '/' */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + char* table_name; + int err = DB_SUCCESS; + ulint namelen = strlen(name); + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(name != NULL); + ut_a(name[namelen - 1] == '/'); + + trx->op_info = "dropping database"; + + trx_start_if_not_started(trx); +loop: + row_mysql_lock_data_dictionary(trx); + + while ((table_name = dict_get_first_table_name_in_db(name))) { + ut_a(memcmp(table_name, name, namelen) == 0); + + table = dict_table_get_low(table_name); + + ut_a(table); + + /* Wait until MySQL does not have any queries running on + the table */ + + if (table->n_mysql_handles_opened > 0) { + row_mysql_unlock_data_dictionary(trx); + + ut_print_timestamp(stderr); + fputs( + " InnoDB: Warning: MySQL is trying to drop database ", stderr); + ut_print_name(stderr, trx, name); + fputs("\n" + "InnoDB: though there are still open handles to table ", stderr); + ut_print_name(stderr, trx, table_name); + fputs(".\n", stderr); + + os_thread_sleep(1000000); + + mem_free(table_name); + + goto loop; + } + + err = row_drop_table_for_mysql(table_name, trx, TRUE); + + mem_free(table_name); + + if (err != DB_SUCCESS) { + fputs("InnoDB: DROP DATABASE ", stderr); + ut_print_name(stderr, trx, name); + fprintf(stderr, " failed with error %lu for table ", + (ulint) err); + ut_print_name(stderr, trx, table_name); + putc('\n', stderr); + break; + } + } + + row_mysql_unlock_data_dictionary(trx); + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + + return(err); +} + +/************************************************************************* +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. */ +static +ibool +row_is_mysql_tmp_table_name( +/*========================*/ + /* out: TRUE if temporary table */ + const char* name) /* in: table name in the form + 'database/tablename' */ +{ + return(strstr(name, "/#sql") != NULL); +} + +/************************************************************************* +Renames a table for MySQL. */ + +int +row_rename_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + const char* old_name, /* in: old table name */ + const char* new_name, /* in: new table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + que_thr_t* thr; + que_t* graph = NULL; + ulint err; + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + static const char str1[] = + "PROCEDURE RENAME_TABLE_PROC () IS\n" + "new_table_name CHAR;\n" + "old_table_name CHAR;\n" + "gen_constr_prefix CHAR;\n" + "new_db_name CHAR;\n" + "foreign_id CHAR;\n" + "new_foreign_id CHAR;\n" + "old_db_name_len INT;\n" + "old_t_name_len INT;\n" + "new_db_name_len INT;\n" + "id_len INT;\n" + "found INT;\n" + "BEGIN\n" + "new_table_name := '"; + static const char str2[] = + "';\nold_table_name := '"; + static const char str3[] = + "';\n" + "UPDATE SYS_TABLES SET NAME = new_table_name\n" + "WHERE NAME = old_table_name;\n"; + static const char str4a1[] = /* drop some constraints of tmp tables */ + "DELETE FROM SYS_FOREIGN_COLS WHERE ID = '"; + static const char str4a2[] = "';\n" + "DELETE FROM SYS_FOREIGN WHERE ID = '"; + static const char str4a3[] = "';\n"; + static const char str4b[] = /* rename all constraints */ + "found := 1;\n" + "old_db_name_len := INSTR(old_table_name, '/') - 1;\n" + "new_db_name_len := INSTR(new_table_name, '/') - 1;\n" + "new_db_name := SUBSTR(new_table_name, 0, new_db_name_len);\n" + "old_t_name_len := LENGTH(old_table_name);\n" + "gen_constr_prefix := CONCAT(old_table_name, '_ibfk_');\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = old_table_name\n" + " AND TO_BINARY(FOR_NAME) = TO_BINARY(old_table_name);\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " UPDATE SYS_FOREIGN\n" + " SET FOR_NAME = new_table_name\n" + " WHERE ID = foreign_id;\n" + " id_len := LENGTH(foreign_id);\n" + " IF (INSTR(foreign_id, '/') > 0) THEN\n" + " IF (INSTR(foreign_id,\n" + " gen_constr_prefix) > 0)\n" + " THEN\n" + " new_foreign_id :=\n" + " CONCAT(new_table_name,\n" + " SUBSTR(foreign_id, old_t_name_len,\n" + " id_len - old_t_name_len));\n" + " ELSE\n" + " new_foreign_id :=\n" + " CONCAT(new_db_name,\n" + " SUBSTR(foreign_id,\n" + " old_db_name_len,\n" + " id_len - old_db_name_len));\n" + " END IF;\n" + " UPDATE SYS_FOREIGN\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " UPDATE SYS_FOREIGN_COLS\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = new_table_name\n" + "WHERE REF_NAME = old_table_name\n" + " AND TO_BINARY(REF_NAME) = TO_BINARY(old_table_name);\n"; + static const char str5[] = + "END;\n"; + + mem_heap_t* heap = NULL; + const char** constraints_to_drop = NULL; + ulint n_constraints_to_drop = 0; + ibool recovering_temp_table = FALSE; + ulint len; + ulint i; + ibool success; + /* length of database name; 0 if not renaming to a temporary table */ + ulint db_name_len; + char* sql; + char* sqlend; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(old_name != NULL); + ut_a(new_name != NULL); + + if (srv_created_new_raw || srv_force_recovery) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + trx_commit_for_mysql(trx); + return(DB_ERROR); + } + + if (row_mysql_is_system_table(new_name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL system table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be of the MyISAM type!\n", + new_name); + + trx_commit_for_mysql(trx); + return(DB_ERROR); + } + + trx->op_info = "renaming table"; + trx_start_if_not_started(trx); + + if (row_mysql_is_recovered_tmp_table(new_name)) { + + recovering_temp_table = TRUE; + } else { + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + } + + table = dict_table_get_low(old_name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, old_name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is trying to rename the table.\n" + "InnoDB: Have you copied the .frm file of the table to the\n" + "InnoDB: MySQL database directory from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n", stderr); + goto funct_exit; + } + + if (table->ibd_file_missing) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, old_name); + fputs( + " does not have an .ibd file in the database directory.\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n", stderr); + goto funct_exit; + } + + /* calculate the length of the SQL string */ + len = (sizeof str1) + (sizeof str2) + (sizeof str3) + (sizeof str5) - 4 + + ut_strlenq(new_name, '\'') + ut_strlenq(old_name, '\''); + + if (row_is_mysql_tmp_table_name(new_name)) { + db_name_len = dict_get_db_name_len(old_name) + 1; + + /* MySQL is doing an ALTER TABLE command and it renames the + original table to a temporary table name. We want to preserve + the original foreign key constraint definitions despite the + name change. An exception is those constraints for which + the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/ + + heap = mem_heap_create(100); + + err = dict_foreign_parse_drop_constraints(heap, trx, + table, + &n_constraints_to_drop, + &constraints_to_drop); + if (err != DB_SUCCESS) { + + goto funct_exit; + } + + /* reserve space for all database names */ + len += 2 * n_constraints_to_drop + * (ut_strlenq(old_name, '\'') + - ut_strlenq(old_name + db_name_len, '\'')); + + for (i = 0; i < n_constraints_to_drop; i++) { + ulint addlen + = 2 * ut_strlenq(constraints_to_drop[i], '\'') + + ((sizeof str4a1) + (sizeof str4a2) + + (sizeof str4a3) - 3); + if (!strchr(constraints_to_drop[i], '/')) { + addlen *= 2; + } + len += addlen; + } + } else { + db_name_len = 0; + len += (sizeof str4b) - 1; + } + + sql = sqlend = mem_alloc(len + 1); + memcpy(sql, str1, (sizeof str1) - 1); + sqlend += (sizeof str1) - 1; + sqlend = ut_strcpyq(sqlend, '\'', new_name); + memcpy(sqlend, str2, (sizeof str2) - 1); + sqlend += (sizeof str2) - 1; + sqlend = ut_strcpyq(sqlend, '\'', old_name); + memcpy(sqlend, str3, (sizeof str3) - 1); + sqlend += (sizeof str3) - 1; + + if (db_name_len) { + /* Internally, old format < 4.0.18 constraints have as the + constraint id <number>_<number>, while new format constraints + have <databasename>/<constraintname>. */ + + for (i = 0; i < n_constraints_to_drop; i++) { + memcpy(sqlend, str4a1, (sizeof str4a1) - 1); + sqlend += (sizeof str4a1) - 1; + sqlend = ut_memcpyq(sqlend, '\'', + old_name, db_name_len); + sqlend = ut_strcpyq(sqlend, '\'', + constraints_to_drop[i]); + memcpy(sqlend, str4a2, (sizeof str4a2) - 1); + sqlend += (sizeof str4a2) - 1; + sqlend = ut_memcpyq(sqlend, '\'', + old_name, db_name_len); + sqlend = ut_strcpyq(sqlend, '\'', + constraints_to_drop[i]); + memcpy(sqlend, str4a3, (sizeof str4a3) - 1); + sqlend += (sizeof str4a3) - 1; + + if (!strchr(constraints_to_drop[i], '/')) { + /* If this happens to be an old format + constraint, let us delete it. Since all new + format constraints contain '/', it does no + harm to run these DELETEs anyway. */ + + memcpy(sqlend, str4a1, (sizeof str4a1) - 1); + sqlend += (sizeof str4a1) - 1; + sqlend = ut_strcpyq(sqlend, '\'', + constraints_to_drop[i]); + memcpy(sqlend, str4a2, (sizeof str4a2) - 1); + sqlend += (sizeof str4a2) - 1; + sqlend = ut_strcpyq(sqlend, '\'', + constraints_to_drop[i]); + memcpy(sqlend, str4a3, (sizeof str4a3) - 1); + sqlend += (sizeof str4a3) - 1; + } + } + } + else { + memcpy(sqlend, str4b, (sizeof str4b) - 1); + sqlend += (sizeof str4b) - 1; + } + + memcpy(sqlend, str5, sizeof str5); + sqlend += sizeof str5; + + ut_a(sqlend == sql + len + 1); + + graph = pars_sql(sql); + + ut_a(graph); + mem_free(sql); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + if (err == DB_DUPLICATE_KEY) { + ut_print_timestamp(stderr); + fputs( + " InnoDB: Error; possible reasons:\n" + "InnoDB: 1) Table rename would cause two FOREIGN KEY constraints\n" + "InnoDB: to have the same internal name in case-insensitive comparison.\n" + "InnoDB: 2) table ", stderr); + ut_print_name(stderr, trx, new_name); + fputs(" exists in the InnoDB internal data\n" + "InnoDB: dictionary though MySQL is trying rename table ", stderr); + ut_print_name(stderr, trx, old_name); + fputs(" to it.\n" + "InnoDB: Have you deleted the .frm file and not used DROP TABLE?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n" + "InnoDB: If table ", stderr); + ut_print_name(stderr, trx, new_name); + fputs( + " is a temporary table #sql..., then it can be that\n" + "InnoDB: there are still queries running on the table, and it will be\n" + "InnoDB: dropped automatically when the queries end.\n" + "InnoDB: You can drop the orphaned table inside InnoDB by\n" + "InnoDB: creating an InnoDB table with the same name in another\n" + "InnoDB: database and copying the .frm file to the current database.\n" + "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n" + "InnoDB: succeed.\n", stderr); + } + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + /* The following call will also rename the .ibd data file if + the table is stored in a single-table tablespace */ + + success = dict_table_rename_in_cache(table, new_name, + !row_is_mysql_tmp_table_name(new_name)); + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); + fputs(" InnoDB: Error in table rename, cannot rename ", + stderr); + ut_print_name(stderr, trx, old_name); + fputs(" to ", stderr); + ut_print_name(stderr, trx, new_name); + putc('\n', stderr); + err = DB_ERROR; + + goto funct_exit; + } + + err = dict_load_foreigns(new_name, trx->check_foreigns); + + if (row_is_mysql_tmp_table_name(old_name)) { + + /* MySQL is doing an ALTER TABLE command and it + renames the created temporary table to the name + of the original table. In the ALTER TABLE we maybe + created some FOREIGN KEY constraints for the temporary + table. But we want to load also the foreign key + constraint definitions for the original table name. */ + + if (err != DB_SUCCESS) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: in ALTER TABLE ", + stderr); + ut_print_name(stderr, trx, new_name); + fputs("\n" + "InnoDB: has or is referenced in foreign key constraints\n" + "InnoDB: which are not compatible with the new table definition.\n", + stderr); + + ut_a(dict_table_rename_in_cache(table, + old_name, FALSE)); + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, + NULL); + trx->error_state = DB_SUCCESS; + } + } else { + if (err != DB_SUCCESS) { + + ut_print_timestamp(stderr); + + fputs( + " InnoDB: Error: in RENAME TABLE table ", + stderr); + ut_print_name(stderr, trx, new_name); + fputs("\n" + "InnoDB: is referenced in foreign key constraints\n" + "InnoDB: which are not compatible with the new table definition.\n", + stderr); + + ut_a(dict_table_rename_in_cache(table, + old_name, FALSE)); + + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, + NULL); + trx->error_state = DB_SUCCESS; + } + } + } +funct_exit: + if (!recovering_temp_table) { + row_mysql_unlock_data_dictionary(trx); + } + + if (graph) { + que_graph_free(graph); + } + + if (heap) { + mem_heap_free(heap); + } + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Checks that the index contains entries in an ascending order, unique +constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. */ +static +ibool +row_scan_and_check_index( +/*=====================*/ + /* out: TRUE if ok */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL */ + dict_index_t* index, /* in: index */ + ulint* n_rows) /* out: number of entries seen in the + current consistent read */ +{ + dtuple_t* prev_entry = NULL; + ulint matched_fields; + ulint matched_bytes; + byte* buf; + ulint ret; + rec_t* rec; + ibool is_ok = TRUE; + int cmp; + ibool contains_null; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + *n_rows = 0; + + buf = mem_alloc(UNIV_PAGE_SIZE); + heap = mem_heap_create(100); + + /* Make a dummy template in prebuilt, which we will use + in scanning the index entries */ + + prebuilt->index = index; + prebuilt->sql_stat_start = TRUE; + prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; + prebuilt->n_template = 0; + prebuilt->need_to_access_clustered = FALSE; + + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + prebuilt->select_lock_type = LOCK_NONE; + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0); +loop: + if (ret != DB_SUCCESS) { + + mem_free(buf); + mem_heap_free(heap); + + return(is_ok); + } + + *n_rows = *n_rows + 1; + + /* row_search... returns the index record in buf, record origin offset + within buf stored in the first 4 bytes, because we have built a dummy + template */ + + rec = buf + mach_read_from_4(buf); + + if (prev_entry != NULL) { + matched_fields = 0; + matched_bytes = 0; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, + &matched_fields, + &matched_bytes); + contains_null = FALSE; + + /* In a unique secondary index we allow equal key values if + they contain SQL NULLs */ + + for (i = 0; + i < dict_index_get_n_ordering_defined_by_user(index); + i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(prev_entry, i))) { + + contains_null = TRUE; + } + } + + if (cmp > 0) { + fputs("InnoDB: index records in a wrong order in ", + stderr); + not_ok: + dict_index_name_print(stderr, + prebuilt->trx, index); + fputs("\n" + "InnoDB: prev record ", stderr); + dtuple_print(stderr, prev_entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + is_ok = FALSE; + } else if ((index->type & DICT_UNIQUE) + && !contains_null + && matched_fields >= + dict_index_get_n_ordering_defined_by_user(index)) { + + fputs("InnoDB: duplicate key in ", stderr); + goto not_ok; + } + } + + mem_heap_empty(heap); + offsets = offsets_; + + prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); + + goto loop; +} + +/************************************************************************* +Checks a table for corruption. */ + +ulint +row_check_table_for_mysql( +/*======================*/ + /* out: DB_ERROR or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + dict_index_t* index; + ulint n_rows; + ulint n_rows_in_table = ULINT_UNDEFINED; + ulint ret = DB_SUCCESS; + ulint old_isolation_level; + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" +"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n" +"InnoDB: table %s does not exist.\n" +"InnoDB: Have you deleted the .ibd file from the database directory under\n" +"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n" +"InnoDB: Look from\n" +"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" +"InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + prebuilt->trx->op_info = "checking table"; + + old_isolation_level = prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we use always + REPEATABLE READ here */ + + prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + /* Enlarge the fatal lock wait timeout during CHECK TABLE. */ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + index = dict_table_get_first_index(table); + + while (index != NULL) { + /* fputs("Validating index ", stderr); + ut_print_name(stderr, index->name); + putc('\n', stderr); */ + + if (!btr_validate_tree(index->tree)) { + ret = DB_ERROR; + } else { + if (!row_scan_and_check_index(prebuilt, + index, &n_rows)) { + ret = DB_ERROR; + } + + /* fprintf(stderr, "%lu entries in index %s\n", n_rows, + index->name); */ + + if (index == dict_table_get_first_index(table)) { + n_rows_in_table = n_rows; + } else if (n_rows != n_rows_in_table) { + + ret = DB_ERROR; + + fputs("Error: ", stderr); + dict_index_name_print(stderr, + prebuilt->trx, index); + fprintf(stderr, + " contains %lu entries, should be %lu\n", + (ulong) n_rows, + (ulong) n_rows_in_table); + } + } + + index = dict_table_get_next_index(index); + } + + /* Restore the original isolation level */ + prebuilt->trx->isolation_level = old_isolation_level; + + /* We validate also the whole adaptive hash index for all tables + at every CHECK TABLE */ + + if (!btr_search_validate()) { + + ret = DB_ERROR; + } + + /* Restore the fatal lock wait timeout after CHECK TABLE. */ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + prebuilt->trx->op_info = ""; + + return(ret); +} diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.c new file mode 100644 index 00000000000..5893e016011 --- /dev/null +++ b/storage/innobase/row/row0purge.c @@ -0,0 +1,671 @@ +/****************************************************** +Purge obsolete records + +(c) 1997 Innobase Oy + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" + +#ifdef UNIV_NONINL +#include "row0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "row0mysql.h" +#include "log0log.h" + +/************************************************************************ +Creates a purge node to a query graph. */ + +purge_node_t* +row_purge_node_create( +/*==================*/ + /* out, own: purge node */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + purge_node_t* node; + + ut_ad(parent && heap); + + node = mem_heap_alloc(heap, sizeof(purge_node_t)); + + node->common.type = QUE_NODE_PURGE; + node->common.parent = parent; + + node->heap = mem_heap_create(256); + + return(node); +} + +/*************************************************************** +Repositions the pcur in the purge node on the clustered index record, +if found. */ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + /* out: TRUE if the record was found */ + ulint mode, /* in: latching mode */ + purge_node_t* node, /* in: row purge node */ + mtr_t* mtr) /* in: mtr */ +{ + ibool found; + + if (node->found_clust) { + found = btr_pcur_restore_position(mode, &(node->pcur), mtr); + + return(found); + } + + found = row_search_on_row_ref(&(node->pcur), mode, node->table, + node->ref, mtr); + node->found_clust = found; + + if (found) { + btr_pcur_store_position(&(node->pcur), mtr); + } + + return(found); +} + +/*************************************************************** +Removes a delete marked clustered index record if possible. */ +static +ibool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + /* out: TRUE if success, or if not found, or + if modified after the delete marking */ + purge_node_t* node, /* in: row purge node */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ibool success; + ulint err; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + index = dict_table_get_first_index(node->table); + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + mtr_start(&mtr); + + success = row_purge_reposition_pcur(mode, node, &mtr); + + if (!success) { + /* The record is already removed */ + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + rec = btr_pcur_get_rec(pcur); + + if (0 != ut_dulint_cmp(node->roll_ptr, + row_get_rec_roll_ptr(rec, index, rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap)))) { + if (heap) { + mem_heap_free(heap); + } + /* Someone else has modified the record later: do not remove */ + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + if (heap) { + mem_heap_free(heap); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, FALSE, &mtr); + + if (err == DB_SUCCESS) { + success = TRUE; + } else if (err == DB_OUT_OF_FILE_SPACE) { + success = FALSE; + } else { + ut_error; + } + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(success); +} + +/*************************************************************** +Removes a clustered index record if it has not been modified after the delete +marking. */ +static +void +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node) /* in: row purge node */ +{ + ibool success; + ulint n_tries = 0; + +/* fputs("Purge: Removing clustered record\n", stderr); */ + + success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/*************************************************************** +Removes a secondary index entry if possible. */ +static +ibool +row_purge_remove_sec_if_poss_low( +/*=============================*/ + /* out: TRUE if success or if not found */ + purge_node_t* node, /* in: row purge node */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode) /* in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has = 0; /* remove warning */ + ibool found; + ulint err; + mtr_t mtr; + mtr_t* mtr_vers; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + if (!found) { + /* Not found */ + + /* fputs("PURGE:........sec entry not found\n", stderr); */ + /* dtuple_print(entry); */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(TRUE); + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + + mtr_vers = mem_alloc(sizeof(mtr_t)); + + mtr_start(mtr_vers); + + success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, mtr_vers); + + if (success) { + old_has = row_vers_old_has_index_entry(TRUE, + btr_pcur_get_rec(&(node->pcur)), + mtr_vers, index, entry); + } + + btr_pcur_commit_specify_mtr(&(node->pcur), mtr_vers); + + mem_free(mtr_vers); + + if (!success || !old_has) { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + FALSE, &mtr); + if (err == DB_SUCCESS) { + success = TRUE; + } else if (err == DB_OUT_OF_FILE_SPACE) { + success = FALSE; + } else { + ut_error; + } + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(success); +} + +/*************************************************************** +Removes a secondary index entry if possible. */ +UNIV_INLINE +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /* in: row purge node */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + ibool success; + ulint n_tries = 0; + +/* fputs("Purge: Removing secondary record\n", stderr); */ + + success = row_purge_remove_sec_if_poss_low(node, index, entry, + BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_low(node, index, entry, + BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/*************************************************************** +Purges a delete marking of a record. */ +static +void +row_purge_del_mark( +/*===============*/ + purge_node_t* node) /* in: row purge node */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + + ut_ad(node); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + /* Build the index entry */ + entry = row_build_index_entry(node->row, index, heap); + + row_purge_remove_sec_if_poss(node, index, entry); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + row_purge_remove_clust_if_poss(node); +} + +/*************************************************************** +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. */ +static +void +row_purge_upd_exist_or_extern( +/*==========================*/ + purge_node_t* node) /* in: row purge node */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + upd_field_t* ufield; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + ulint internal_offset; + byte* data_field; + ulint data_field_len; + ulint i; + mtr_t mtr; + + ut_ad(node); + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + goto skip_secondaries; + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field_binary(NULL, node->index, + node->update)) { + /* Build the older version of the index entry */ + entry = row_build_index_entry(node->row, index, heap); + + row_purge_remove_sec_if_poss(node, index, entry); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + +skip_secondaries: + /* Free possible externally stored fields */ + for (i = 0; i < upd_get_n_fields(node->update); i++) { + + ufield = upd_get_nth_field(node->update, i); + + if (ufield->extern_storage) { + /* We use the fact that new_val points to + node->undo_rec and get thus the offset of + dfield data inside the unod record. Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + internal_offset = ((byte*)ufield->new_val.data) + - node->undo_rec; + + ut_a(internal_offset < UNIV_PAGE_SIZE); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + mtr_start(&mtr); + + /* We have to acquire an X-latch to the clustered + index tree */ + + index = dict_table_get_first_index(node->table); + + mtr_x_lock(dict_tree_get_lock(index->tree), &mtr); + + /* NOTE: we must also acquire an X-latch to the + root page of the tree. We will need it when we + free pages from the tree. If the tree is of height 1, + the tree X-latch does NOT protect the root page, + because it is also a leaf page. Since we will have a + latch on an undo log page, we would break the + latching order if we would only later latch the + root page of such a tree! */ + + btr_root_get(index->tree, &mtr); + + /* We assume in purge of externally stored fields + that the space id of the undo log record is 0! */ + + data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr) + + offset + internal_offset; + +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(buf_frame_align(data_field), + SYNC_TRX_UNDO_PAGE); +#endif /* UNIV_SYNC_DEBUG */ + + data_field_len = ufield->new_val.len; + + btr_free_externally_stored_field(index, data_field, + data_field_len, FALSE, &mtr); + mtr_commit(&mtr); + } + } +} + +/*************************************************************** +Parses the row reference and other info in a modify undo log record. */ +static +ibool +row_purge_parse_undo_rec( +/*=====================*/ + /* out: TRUE if purge operation required: + NOTE that then the CALLER must unfreeze + data dictionary! */ + purge_node_t* node, /* in: row undo node */ + ibool* updated_extern, + /* out: TRUE if an externally stored field + was updated */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + trx_t* trx; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + + ut_ad(node && thr); + + trx = thr_get_trx(thr); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + updated_extern, &undo_no, &table_id); + node->rec_type = type; + + if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) { + + return(FALSE); + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + node->table = NULL; + + if (type == TRX_UNDO_UPD_EXIST_REC + && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) { + + /* Purge requires no changes to indexes: we may return */ + + return(FALSE); + } + + /* Prevent DROP TABLE etc. from running when we are doing the purge + for this row */ + + row_mysql_freeze_data_dictionary(trx); + + mutex_enter(&(dict_sys->mutex)); + + node->table = dict_table_get_on_id_low(table_id, trx); + + mutex_exit(&(dict_sys->mutex)); + + if (node->table == NULL) { + /* The table has been dropped: no need to do purge */ + + row_mysql_unfreeze_data_dictionary(trx); + + return(FALSE); + } + + if (node->table->ibd_file_missing) { + /* We skip purge of missing .ibd files */ + + node->table = NULL; + + row_mysql_unfreeze_data_dictionary(trx); + + return(FALSE); + } + + clust_index = dict_table_get_first_index(node->table); + + if (clust_index == NULL) { + /* The table was corrupt in the data dictionary */ + + row_mysql_unfreeze_data_dictionary(trx); + + return(FALSE); + } + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + if (!cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + ptr = trx_undo_rec_get_partial_row(ptr, clust_index, + &(node->row), node->heap); + } + + return(TRUE); +} + +/*************************************************************** +Fetches an undo log record and does the purge for the recorded operation. +If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. */ +static +ulint +row_purge( +/*======*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr) /* in: query thread */ +{ + dulint roll_ptr; + ibool purge_needed; + ibool updated_extern; + trx_t* trx; + + ut_ad(node && thr); + + trx = thr_get_trx(thr); + + node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr, + &(node->reservation), + node->heap); + if (!node->undo_rec) { + /* Purge completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + + if (node->undo_rec == &trx_purge_dummy_rec) { + purge_needed = FALSE; + } else { + purge_needed = row_purge_parse_undo_rec(node, &updated_extern, + thr); + /* If purge_needed == TRUE, we must also remember to unfreeze + data dictionary! */ + } + + if (purge_needed) { + node->found_clust = FALSE; + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { + row_purge_del_mark(node); + + } else if (updated_extern + || node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + row_purge_upd_exist_or_extern(node); + } + + if (node->found_clust) { + btr_pcur_close(&(node->pcur)); + } + + row_mysql_unfreeze_data_dictionary(trx); + } + + /* Do some cleanup */ + trx_purge_rec_release(node->reservation); + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(DB_SUCCESS); +} + +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. */ + +que_thr_t* +row_purge_step( +/*===========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + purge_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + + err = row_purge(node, thr); + + ut_ad(err == DB_SUCCESS); + + return(thr); +} diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c new file mode 100644 index 00000000000..a6d3f1d5ab0 --- /dev/null +++ b/storage/innobase/row/row0row.c @@ -0,0 +1,730 @@ +/****************************************************** +General row routines + +(c) 1996 Innobase Oy + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" + +#ifdef UNIV_NONINL +#include "row0row.ic" +#endif + +#include "dict0dict.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" + +/************************************************************************* +Reads the trx id or roll ptr field from a clustered index record: this function +is slower than the specialized inline functions. */ + +dulint +row_get_rec_sys_field( +/*==================*/ + /* out: value of the field */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint pos; + byte* field; + ulint len; + + ut_ad(index->type & DICT_CLUSTERED); + + pos = dict_index_get_sys_col_pos(index, type); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + if (type == DATA_TRX_ID) { + + return(trx_read_trx_id(field)); + } else { + ut_ad(type == DATA_ROLL_PTR); + + return(trx_read_roll_ptr(field)); + } +} + +/************************************************************************* +Sets the trx id or roll ptr field in a clustered index record: this function +is slower than the specialized inline functions. */ + +void +row_set_rec_sys_field( +/*==================*/ + /* out: value of the field */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + dulint val) /* in: value to set */ +{ + ulint pos; + byte* field; + ulint len; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, type); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + if (type == DATA_TRX_ID) { + + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + + trx_write_roll_ptr(field, val); + } +} + +/********************************************************************* +When an insert to a table is performed, this function builds the entry which +has to be inserted to an index on the table. */ + +dtuple_t* +row_build_index_entry( +/*==================*/ + /* out: index entry which should be inserted */ + dtuple_t* row, /* in: row which should be inserted to the + table */ + dict_index_t* index, /* in: index on the table */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the index entry is allocated */ +{ + dtuple_t* entry; + ulint entry_len; + dict_field_t* ind_field; + dfield_t* dfield; + dfield_t* dfield2; + dict_col_t* col; + ulint i; + ulint storage_len; + dtype_t* cur_type; + + ut_ad(row && index && heap); + ut_ad(dtuple_check_typed(row)); + + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + if (index->type & DICT_UNIVERSAL) { + dtuple_set_n_fields_cmp(entry, entry_len); + } else { + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + } + + for (i = 0; i < entry_len; i++) { + ind_field = dict_index_get_nth_field(index, i); + col = ind_field->col; + + dfield = dtuple_get_nth_field(entry, i); + + dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_copy(dfield, dfield2); + + /* If a column prefix index, take only the prefix */ + if (ind_field->prefix_len > 0 + && dfield_get_len(dfield2) != UNIV_SQL_NULL) { + + cur_type = dict_col_get_type( + dict_field_get_col(ind_field)); + + storage_len = dtype_get_at_most_n_mbchars( + cur_type, + ind_field->prefix_len, + dfield_get_len(dfield2), dfield2->data); + + dfield_set_len(dfield, storage_len); + } + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*********************************************************************** +An inverse function to dict_row_build_index_entry. Builds a row from a +record in a clustered index. */ + +dtuple_t* +row_build( +/*======*/ + /* out, own: row built; see the NOTE below! */ + ulint type, /* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or + ROW_COPY_ALSO_EXTERNALS, + the two last copy also the data fields to + heap as the first only places pointers to + data fields on the index page, and thus is + more efficient */ + dict_index_t* index, /* in: clustered index */ + rec_t* rec, /* in: record in the clustered index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dtuple_t* row; + dict_table_t* table; + dict_field_t* ind_field; + dict_col_t* col; + dfield_t* dfield; + ulint n_fields; + byte* field; + ulint len; + ulint row_len; + byte* buf; + ulint i; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(index && rec && heap); + ut_ad(index->type & DICT_CLUSTERED); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + if (type != ROW_COPY_POINTERS) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, (ulint*) offsets); + } + + table = index->table; + row_len = dict_table_get_n_cols(table); + + row = dtuple_create(heap, row_len); + + dtuple_set_info_bits(row, rec_get_info_bits(rec, table->comp)); + + n_fields = rec_offs_n_fields(offsets); + + dict_table_copy_types(row, table); + + for (i = 0; i < n_fields; i++) { + ind_field = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len == 0) { + + col = dict_field_get_col(ind_field); + dfield = dtuple_get_nth_field(row, + dict_col_get_no(col)); + field = rec_get_nth_field(rec, offsets, i, &len); + + if (type == ROW_COPY_ALSO_EXTERNALS + && rec_offs_nth_extern(offsets, i)) { + + field = btr_rec_copy_externally_stored_field( + rec, offsets, i, &len, heap); + } + + dfield_set_data(dfield, field, len); + } + } + + ut_ad(dtuple_check_typed(row)); + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(row); +} + +/*********************************************************************** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. */ + +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + /* out, own: index entry built; see the + NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap as the latter only places pointers to + data fields on the index page */ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the dtuple is used! */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + ulint i; + byte* field; + ulint len; + ulint rec_len; + byte* buf; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(rec && heap && index); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + rec_len = rec_offs_n_fields(offsets); + + entry = dtuple_create(heap, rec_len); + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(rec_len == dict_index_get_n_fields(index)); + + dict_index_copy_types(entry, index, rec_len); + + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); + + for (i = 0; i < rec_len; i++) { + + dfield = dtuple_get_nth_field(entry, i); + field = rec_get_nth_field(rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + } + + ut_ad(dtuple_check_typed(entry)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(entry); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ + +dtuple_t* +row_build_row_ref( +/*==============*/ + /* out, own: row reference built; see the + NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dtuple_t* ref; + byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(index && rec && heap); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = + dict_index_get_nth_field(clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dfield_get_type(dfield), + clust_col_prefix_len, len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(ref); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ + +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; see the + NOTE below! */ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in the index; + NOTE: the data fields in ref will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + trx_t* trx) /* in: transaction */ +{ + dict_index_t* clust_index; + dfield_t* dfield; + byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_a(ref && index && rec); + + if (!index->table) { + fputs("InnoDB: table ", stderr); + notfound: + ut_print_name(stderr, trx, index->table_name); + fputs(" for index ", stderr); + ut_print_name(stderr, trx, index->name); + fputs(" not found\n", stderr); + ut_error; + } + + clust_index = dict_table_get_first_index(index->table); + + if (!clust_index) { + fputs("InnoDB: clust index for table ", stderr); + goto notfound; + } + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = + dict_index_get_nth_field(clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dfield_get_type(dfield), + clust_col_prefix_len, len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (heap) { + mem_heap_free(heap); + } +} + +/*********************************************************************** +From a row build a row reference with which we can search the clustered +index record. */ + +void +row_build_row_ref_from_row( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; see the + NOTE below! ref must have the right number + of fields! */ + dict_table_t* table, /* in: table */ + dtuple_t* row) /* in: row + NOTE: the data fields in ref will point + directly into data of this row */ +{ + dict_index_t* clust_index; + dict_field_t* field; + dfield_t* dfield; + dfield_t* dfield2; + dict_col_t* col; + ulint ref_len; + ulint i; + dtype_t* cur_type; + + ut_ad(ref && table && row); + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field = dict_index_get_nth_field(clust_index, i); + + col = dict_field_get_col(field); + + dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_copy(dfield, dfield2); + + if (field->prefix_len > 0 + && dfield->len != UNIV_SQL_NULL) { + + cur_type = dict_col_get_type( + dict_field_get_col(field)); + + dfield->len = dtype_get_at_most_n_mbchars( + cur_type, + field->prefix_len, + dfield->len, dfield->data); + } + } + + ut_ad(dtuple_check_typed(ref)); +} + +/******************************************************************* +Searches the clustered index record for a row, if we have the row reference. */ + +ibool +row_search_on_row_ref( +/*==================*/ + /* out: TRUE if found */ + btr_pcur_t* pcur, /* in/out: persistent cursor, which must + be closed by the caller */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + dict_table_t* table, /* in: table */ + dtuple_t* ref, /* in: row reference */ + mtr_t* mtr) /* in: mtr */ +{ + ulint low_match; + rec_t* rec; + dict_index_t* index; + page_t* page; + + ut_ad(dtuple_check_typed(ref)); + + index = dict_table_get_first_index(table); + + ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index)); + + btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr); + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + page = buf_frame_align(rec); + + if (rec == page_get_infimum_rec(page)) { + + return(FALSE); + } + + if (low_match != dtuple_get_n_fields(ref)) { + + return(FALSE); + } + + return(TRUE); +} + +/************************************************************************* +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. */ + +rec_t* +row_get_clust_rec( +/*==============*/ + /* out: record or NULL, if no record found */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: secondary index */ + dict_index_t** clust_index,/* out: clustered index */ + mtr_t* mtr) /* in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + ibool found; + rec_t* clust_rec; + + ut_ad((index->type & DICT_CLUSTERED) == 0); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL; + + mem_heap_free(heap); + + btr_pcur_close(&pcur); + + *clust_index = dict_table_get_first_index(table); + + return(clust_rec); +} + +/******************************************************************* +Searches an index record. */ + +ibool +row_search_index_entry( +/*===================*/ + /* out: TRUE if found */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /* in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /* in: mtr */ +{ + ulint n_fields; + ulint low_match; + page_t* page; + rec_t* rec; + + ut_ad(dtuple_check_typed(entry)); + + btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr); + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + page = buf_frame_align(rec); + + n_fields = dtuple_get_n_fields(entry); + + if (rec == page_get_infimum_rec(page)) { + + return(FALSE); + } + + if (low_match != n_fields) { + /* Not found */ + + return(FALSE); + } + + return(TRUE); +} diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.c new file mode 100644 index 00000000000..94cf82d6a3d --- /dev/null +++ b/storage/innobase/row/row0sel.c @@ -0,0 +1,4066 @@ +/******************************************************* +Select + +(c) 1997 Innobase Oy + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" + +#ifdef UNIV_NONINL +#include "row0sel.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "read0read.h" +#include "buf0lru.h" + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/************************************************************************ +Returns TRUE if the user-defined column values in a secondary index record +are alphabetically the same as the corresponding columns in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! */ +static +ibool +row_sel_sec_rec_is_for_clust_rec( +/*=============================*/ + /* out: TRUE if the secondary + record is equal to the corresponding + fields in the clustered record, + when compared with collation */ + rec_t* sec_rec, /* in: secondary index record */ + dict_index_t* sec_index, /* in: secondary index */ + rec_t* clust_rec, /* in: clustered index record */ + dict_index_t* clust_index) /* in: clustered index */ +{ + dict_field_t* ifield; + dict_col_t* col; + byte* sec_field; + ulint sec_len; + byte* clust_field; + ulint clust_len; + ulint n; + ulint i; + dtype_t* cur_type; + mem_heap_t* heap = NULL; + ulint clust_offsets_[REC_OFFS_NORMAL_SIZE]; + ulint sec_offsets_[REC_OFFS_SMALL_SIZE]; + ulint* clust_offs = clust_offsets_; + ulint* sec_offs = sec_offsets_; + ibool is_equal = TRUE; + + *clust_offsets_ = (sizeof clust_offsets_) / sizeof *clust_offsets_; + *sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_; + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + ULINT_UNDEFINED, &heap); + + n = dict_index_get_n_ordering_defined_by_user(sec_index); + + for (i = 0; i < n; i++) { + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + + clust_field = rec_get_nth_field(clust_rec, clust_offs, + dict_col_get_clust_pos(col), + &clust_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); + + if (ifield->prefix_len > 0 + && clust_len != UNIV_SQL_NULL) { + + cur_type = dict_col_get_type( + dict_field_get_col(ifield)); + + clust_len = dtype_get_at_most_n_mbchars( + cur_type, + ifield->prefix_len, + clust_len, (char*) clust_field); + } + + if (0 != cmp_data_data(dict_col_get_type(col), + clust_field, clust_len, + sec_field, sec_len)) { + is_equal = FALSE; + goto func_exit; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(is_equal); +} + +/************************************************************************* +Creates a select node struct. */ + +sel_node_t* +sel_node_create( +/*============*/ + /* out, own: select node struct */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + sel_node_t* node; + + node = mem_heap_alloc(heap, sizeof(sel_node_t)); + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->select_will_do_update = FALSE; + node->latch_mode = BTR_SEARCH_LEAF; + + node->plans = NULL; + + return(node); +} + +/************************************************************************* +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ + +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /* in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/************************************************************************* +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/************************************************************************* +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /* in: first variable in a list of variables */ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + exp = node->select_list; + + while (var) { + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + var = que_node_get_next(var); + } +} + +/************************************************************************* +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /* in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + func_node = node->select_list; + + while (func_node) { + eval_node_set_int_val(func_node, 0); + + func_node = que_node_get_next(func_node); + } + + node->aggregate_already_fetched = FALSE; +} + +/************************************************************************* +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /* in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/************************************************************************* +Fetches the column values from a record. */ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /* in: record index */ + rec_t* rec, /* in: record in a clustered or non-clustered + index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + sym_node_t* column) /* in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + byte* data; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (index->type & DICT_CLUSTERED) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + data = rec_get_nth_field(rec, offsets, field_no, &len); + + if (column->copy_val) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /* in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH + * sizeof(sel_buf_t)); + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + + sel_buf->val_buf_size = 0; + } +} + +/************************************************************************* +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ + +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + mem_free(sel_buf->data); + } + } +} + +/************************************************************************* +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. */ +static +void +sel_pop_prefetched_row( +/*===================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); +#ifdef UNIV_DEBUG + dfield_set_data(val, NULL, 0); +#endif + goto next_col; + } + + ut_ad(column->prefetch_buf); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = dfield_get_data(val); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/************************************************************************* +Pushes the column values for a prefetched, cached row to the column prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_push_prefetched_row( +/*====================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! */ + + goto next_col; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = dfield_get_data(val); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers( +/*====================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, plan->index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (plan->old_vers_heap) { + mem_heap_empty(plan->old_vers_heap); + } else { + plan->old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read(rec, mtr, plan->index, + offsets, read_view, offset_heap, + plan->old_vers_heap, old_vers); + return(err); +} + +/************************************************************************* +Tests the conditions which determine when the index segment we are searching +through has been exhausted. */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(cond->args); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Tests the other conditions. */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. */ +static +ulint +row_sel_get_clust_rec( +/*==================*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select_node */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record in a non-clustered index */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + offsets, ULINT_UNDEFINED, &heap); + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); + + index = dict_table_get_first_index(plan->table); + + btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE, + node->latch_mode, &(plan->clust_pcur), + 0, mtr); + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + ut_a(rec_get_deleted_flag(rec, plan->table->comp)); + ut_a(node->read_view); + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + clust_rec = NULL; + + goto func_exit; + } + + offsets = rec_get_offsets(clust_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + ulint lock_type; + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = lock_clust_rec_read_check_and_lock(0, + clust_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers(node->read_view, plan, + clust_rec, &offsets, &heap, + &old_vers, mtr); + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + goto func_exit; + } + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if ((old_vers || rec_get_deleted_flag(rec, plan->table->comp)) + && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, + clust_rec, index)) { + clust_rec = NULL; + goto func_exit; + } + } + + /* Fetch the columns needed in test conditions */ + + row_sel_fetch_columns(index, clust_rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); +func_exit: + *out_rec = clust_rec; + err = DB_SUCCESS; +err_exit: + if (heap) { + mem_heap_free(heap); + } + return(err); +} + +/************************************************************************* +Sets a lock on a record. */ +UNIV_INLINE +ulint +sel_set_rec_lock( +/*=============*/ + /* out: DB_SUCCESS or error code */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint mode, /* in: lock mode */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */ + que_thr_t* thr) /* in: query thread */ +{ + trx_t* trx; + ulint err; + + trx = thr_get_trx(thr); + + if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) { + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + } + + if (index->type & DICT_CLUSTERED) { + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); + } + + return(err); +} + +/************************************************************************* +Opens a pcur to a table index. */ +static +void +row_sel_open_pcur( +/*==============*/ + sel_node_t* node, /* in: select node */ + plan_t* plan, /* in: table plan */ + ibool search_latch_locked, + /* in: TRUE if the thread currently + has the search latch locked in + s-mode */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + func_node_t* cond; + que_node_t* exp; + ulint n_fields; + ulint has_search_latch = 0; /* RW_S_LATCH or 0 */ + ulint i; + + if (search_latch_locked) { + has_search_latch = RW_S_LATCH; + } + + index = plan->index; + + /* Calculate the value of the search tuple: the exact match columns + get their expressions evaluated when we evaluate the right sides of + end_conds */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + eval_exp(que_node_get_next(cond->args)); + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + + if (plan->n_exact_match < n_fields) { + /* There is a non-exact match field which must be + evaluated separately */ + + eval_exp(plan->tuple_exps[n_fields - 1]); + } + + for (i = 0; i < n_fields; i++) { + exp = plan->tuple_exps[i]; + + dfield_copy_data(dtuple_get_nth_field(plan->tuple, i), + que_node_get_val(exp)); + } + + /* Open pcur to the index */ + + btr_pcur_open_with_no_init(index, plan->tuple, plan->mode, + node->latch_mode, &(plan->pcur), + has_search_latch, mtr); + } else { + /* Open the cursor to the start or the end of the index + (FALSE: no init) */ + + btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode, + &(plan->pcur), FALSE, mtr); + } + + ut_ad(plan->n_rows_prefetched == 0); + ut_ad(plan->n_rows_fetched == 0); + ut_ad(plan->cursor_at_end == FALSE); + + plan->pcur_is_open = TRUE; +} + +/************************************************************************* +Restores a stored pcur position to a table index. */ +static +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + /* out: TRUE if the cursor should be moved to + the next record after we return from this + function (moved to the previous, in the case + of a descending cursor) without processing + again the current cursor record */ + sel_node_t* node, /* in: select node */ + plan_t* plan, /* in: table plan */ + mtr_t* mtr) /* in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = btr_pcur_restore_position(node->latch_mode, + &(plan->pcur), mtr); + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. */ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(TRUE); +} + +/************************************************************************* +Resets a plan cursor to a closed state. */ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /* in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +/************************************************************************* +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ + sel_node_t* node, /* in: select node for a consistent read */ + plan_t* plan, /* in: plan for a unique search in clustered + index */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + index = plan->index; + + ut_ad(node->read_view); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + row_sel_open_pcur(node, plan, TRUE, mtr); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (index->type & DICT_CLUSTERED) { + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + ret = SEL_RETRY; + goto func_exit; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { + + ret = SEL_RETRY; + goto func_exit; + } + + /* Test deleted flag. Fetch the columns needed in test conditions. */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + if (rec_get_deleted_flag(rec, plan->table->comp)) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + ut_ad(plan->pcur.latch_mode == node->latch_mode); + + plan->n_rows_fetched++; +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(SEL_FOUND); +} + +/************************************************************************* +Performs a select step. */ +static +ulint +row_sel( +/*====*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + ibool search_latch_locked; + ibool consistent_read; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool leaf_contains_updates = FALSE; + /* TRUE if select_will_do_update is + TRUE and the current clustered index + leaf page has been updated during + the current mtr: mtr must be committed + at the same time as the leaf x-latch + is released */ + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + ulint found_flag; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(thr->run_node == node); + + search_latch_locked = FALSE; + + if (node->read_view) { + /* In consistent reads, we try to do with the hash index and + not to use the buffer page get. This is to reduce memory bus + load resulting from semaphore operations. The search latch + will be s-locked when we access an index with a unique search + condition, but not locked when we access an index with a + less selective search condition. */ + + consistent_read = TRUE; + } else { + consistent_read = FALSE; + } + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(leaf_contains_updates == FALSE); + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr_start(&mtr); + + if (consistent_read && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust) { + if (!search_latch_locked) { + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; + } else if (btr_search_latch.writer_is_wait_ex) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often + kept for some 10 searches before being released, + a waiting x-latch request would block other threads + from acquiring an s-latch for a long time, lowering + performance significantly in multiprocessors. */ + + rw_lock_s_unlock(&btr_search_latch); + rw_lock_s_lock(&btr_search_latch); + } + + found_flag = row_sel_try_search_shortcut(node, plan, &mtr); + + if (found_flag == SEL_FOUND) { + + goto next_table; + + } else if (found_flag == SEL_EXHAUSTED) { + + goto table_exhausted; + } + + ut_ad(found_flag == SEL_RETRY); + + plan_reset_cursor(plan); + + mtr_commit(&mtr); + mtr_start(&mtr); + } + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + + search_latch_locked = FALSE; + } + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + + row_sel_open_pcur(node, plan, search_latch_locked, &mtr); + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && (rec != page_get_supremum_rec(buf_frame_align(rec)))) { + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!consistent_read) { + + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(next_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + + goto lock_wait_or_error; + } + } + } + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. */ + + cost_counter++; + + goto next_rec; + } + + if (!consistent_read) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + + ulint lock_type; + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } else if (plan->mixed_index) { + /* We have to check if the record in a mixed cluster belongs + to this table */ + + if (!dict_is_mixed_table_rec(plan->table, rec)) { + + goto next_rec; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (consistent_read) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (index->type & DICT_CLUSTERED) { + + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers(node->read_view, + plan, rec, + &offsets, &heap, + &old_vers, &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); + row_sel_fetch_columns(index, rec, + offsets, + UT_LIST_GET_FIRST(plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, + node->read_view)) { + cons_read_requires_clust_rec = TRUE; + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec, plan->table->comp) + && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(consistent_read); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, plan->table->comp)) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == node->latch_mode); + + if (node->select_will_do_update) { + /* This is a searched update and we can do the update in-place, + saving CPU time */ + + row_upd_in_place_in_select(node, thr, &mtr); + + leaf_contains_updates = TRUE; + + /* When the database is in the online backup mode, the number + of log records for a single mtr should be small: increment the + cost counter to ensure it */ + + cost_counter += 1 + (SEL_COST_LIMIT / 8); + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_push_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_pop_prefetched_row(plan); + + goto next_table; + } + +next_rec: + ut_ad(!search_latch_locked); + + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. */ + + goto commit_mtr_for_a_while; + } + + if (leaf_contains_updates + && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) { + + /* We must commit &mtr if we are moving to a different page, + because we have done updates to the x-latched leaf page, and + the latch would be released in btr_pcur_move_to_next, without + &mtr getting committed there */ + + ut_ad(node->asc); + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + err = DB_SUCCESS; + goto func_exit; + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + err = DB_SUCCESS; + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + goto func_exit; + } + + node->state = SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + goto func_exit; + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + err = DB_SUCCESS; + goto func_exit; + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. */ + + plan->stored_cursor_rec_processed = TRUE; + + ut_ad(!search_latch_locked); + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + goto table_loop; + +lock_wait_or_error: + /* See the note at stop_for_a_while: the same holds for this case */ + + ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr) + || !node->asc); + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(err); +} + +/************************************************************************** +Performs a select step. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_sel_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint i_lock_mode; + sym_node_t* table_node; + sel_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_SELECT); + + /* If this is a new time this node is executed (or when execution + resumes after wait for a table intention lock), set intention locks + on the tables, or assign a read view */ + + if (node->into_list && (thr->prev_node == que_node_get_parent(node))) { + + node->state = SEL_NODE_OPEN; + } + + if (node->state == SEL_NODE_OPEN) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(thr_get_trx(thr)); + + plan_reset_cursor(sel_node_get_nth_plan(node, 0)); + + if (node->consistent_read) { + /* Assign a read view for the query */ + node->read_view = trx_assign_read_view( + thr_get_trx(thr)); + } else { + if (node->set_x_locks) { + i_lock_mode = LOCK_IX; + } else { + i_lock_mode = LOCK_IS; + } + + table_node = node->table_list; + + while (table_node) { + err = lock_table(0, table_node->table, + i_lock_mode, thr); + if (err != DB_SUCCESS) { + + que_thr_handle_error(thr, DB_ERROR, + NULL, 0); + return(NULL); + } + + table_node = que_node_get_next(table_node); + } + } + + /* If this is an explicit cursor, copy stored procedure + variable values, so that the values cannot change between + fetches (currently, we copy them also for non-explicit + cursors) */ + + if (node->explicit_cursor && + UT_LIST_GET_FIRST(node->copy_variables)) { + + row_sel_copy_input_variable_vals(node); + } + + node->state = SEL_NODE_FETCH; + node->fetch_table = 0; + + if (node->is_aggregate) { + /* Reset the aggregate total values */ + sel_reset_aggregate_vals(node); + } + } + + err = row_sel(node, thr); + + /* NOTE! if queries are parallelized, the following assignment may + have problems; the assignment should be made only if thr is the + only top-level thr in the graph: */ + + thr->graph->last_sel_node = node; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + fprintf(stderr, "SQL error %lu\n", (ulong) err); + + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + + return(NULL); + } + + return(thr); +} + +/************************************************************************** +Performs a fetch for a cursor. */ + +que_thr_t* +fetch_step( +/*=======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + sel_assign_into_var_values(node->into_list, sel_node); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + /* SQL error detected */ + fprintf(stderr, "SQL error %lu\n", (ulong)DB_ERROR); + + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/*************************************************************** +Prints a row in a select result. */ + +que_thr_t* +row_printf_step( +/*============*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = thr->run_node; + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + fputs(" ::: ", stderr); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/******************************************************************** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /* in: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /* in: buffer to use in field + conversions */ + ulint buf_len, /* in: buffer length */ + dict_index_t* index, /* in: index of the key value */ + byte* key_ptr, /* in: MySQL key value */ + ulint key_len, /* in: MySQL key value length */ + trx_t* trx) /* in: transaction */ +{ + byte* original_buf = buf; + byte* original_key_ptr = key_ptr; + dict_field_t* field; + dfield_t* dfield; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; + byte* key_end; + ulint n_fields = 0; + ulint type; + + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); + + if (dfield_get_type(dfield)->mtype == DATA_SYS) { + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. */ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + + ut_a(dict_col_get_type(field->col)->mtype + == dfield_get_type(dfield)->mtype); + + data_offset = 0; + is_null = FALSE; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + data_offset = 1; + + if (*key_ptr != 0) { + dfield_set_data(dfield, NULL, UNIV_SQL_NULL); + + is_null = TRUE; + } + } + + type = dfield_get_type(dfield)->mtype; + + /* Calculate data length and data field total length */ + + if (type == DATA_BLOB) { + /* The key field is a column prefix of a BLOB or + TEXT */ + + ut_a(field->prefix_len > 0); + + /* MySQL stores the actual data length to the first 2 + bytes after the optional SQL NULL marker byte. The + storage format is little-endian, that is, the most + significant byte at a higher address. In UTF-8, MySQL + seems to reserve field->prefix_len bytes for + storing this field in the key value buffer, even + though the actual value only takes data_len bytes + from the start. */ + + data_len = key_ptr[data_offset] + + 256 * key_ptr[data_offset + 1]; + data_field_len = data_offset + 2 + field->prefix_len; + + data_offset += 2; + + /* Now that we know the length, we store the column + value like it would be a fixed char field */ + + } else if (field->prefix_len > 0) { + /* Looks like MySQL pads unused end bytes in the + prefix with space. Therefore, also in UTF-8, it is ok + to compare with a prefix containing full prefix_len + bytes, and no need to take at most prefix_len / 3 + UTF-8 characters from the start. + If the prefix is used as the upper end of a LIKE + 'abc%' query, then MySQL pads the end with chars + 0xff. TODO: in that case does it any harm to compare + with the full prefix_len bytes. How do characters + 0xff in UTF-8 behave? */ + + data_len = field->prefix_len; + data_field_len = data_offset + data_len; + } else { + data_len = dfield_get_type(dfield)->len; + data_field_len = data_offset + data_len; + } + + if (dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR + && dfield_get_type(dfield)->mtype != DATA_INT) { + /* In a MySQL key value format, a true VARCHAR is + always preceded by 2 bytes of a length field. + dfield_get_type(dfield)->len returns the maximum + 'payload' len in bytes. That does not include the + 2 bytes that tell the actual data length. + + We added the check != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! */ + + data_len += 2; + data_field_len += 2; + } + + /* Storing may use at most data_len bytes of buf */ + + if (!is_null) { + row_mysql_store_col_in_innobase_format( + dfield, + buf, + FALSE, /* MySQL key value format col */ + key_ptr + data_offset, + data_len, + index->table->comp); + buf += data_len; + } + + key_ptr += data_field_len; + + if (key_ptr > key_end) { + /* The last field in key was not a complete key field + but a prefix of it. + + Print a warning about this! HA_READ_PREFIX_LAST does + not currently work in InnoDB with partial-field key + value prefixes. Since MySQL currently uses a padding + trick to calculate LIKE 'abc%' type queries there + should never be partial-field prefixes in searches. */ + + ut_print_timestamp(stderr); + + fputs( + " InnoDB: Warning: using a partial-field key prefix in search.\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, ". Last data field length %lu bytes,\n" + "InnoDB: key ptr now exceeds key end by %lu bytes.\n" + "InnoDB: Key value in the MySQL format:\n", + (ulong) data_field_len, + (ulong) (key_ptr - key_end)); + fflush(stderr); + ut_print_buf(stderr, original_key_ptr, key_len); + fprintf(stderr, "\n"); + + if (!is_null) { + dfield->len -= (ulint)(key_ptr - key_end); + } + } + + n_fields++; + field++; + dfield++; + } + + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/****************************************************************** +Stores the row id to the prebuilt struct. */ +static +void +row_sel_store_row_id_to_prebuilt( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt */ + rec_t* index_rec, /* in: record */ + dict_index_t* index, /* in: index of the record */ + const ulint* offsets) /* in: rec_get_offsets + (index_rec, index) */ +{ + byte* data; + ulint len; + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field(index_rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); + + if (len != DATA_ROW_ID_LEN) { + fprintf(stderr, +"InnoDB: Error: Row id field is wrong length %lu in ", (ulong) len); + dict_index_name_print(stderr, prebuilt->trx, index); + fprintf(stderr, "\n" +"InnoDB: Field number %lu, record:\n", + (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID)); + rec_print_new(stderr, index_rec, offsets); + putc('\n', stderr); + ut_error; + } + + ut_memcpy(prebuilt->row_id, data, len); +} + +/****************************************************************** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */ +static +void +row_sel_field_store_in_mysql_format( +/*================================*/ + byte* dest, /* in/out: buffer where to store; NOTE that BLOBs + are not in themselves stored here: the caller must + allocate and copy the BLOB into buffer before, and pass + the pointer to the BLOB in 'data' */ + const mysql_row_templ_t* templ, /* in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, mbminlen, mbmaxlen */ + byte* data, /* in: data to store */ + ulint len) /* in: length of the data */ +{ + byte* ptr; + byte* field_end; + byte* pad_ptr; + + ut_ad(len != UNIV_SQL_NULL); + + if (templ->type == DATA_INT) { + /* Convert integer data from Innobase to a little-endian + format, sign bit restored to normal */ + + ptr = dest + len; + + for (;;) { + ptr--; + *ptr = *data; + if (ptr == dest) { + break; + } + data++; + } + + if (!templ->is_unsigned) { + dest[len - 1] = (byte) (dest[len - 1] ^ 128); + } + + ut_ad(templ->mysql_col_len == len); + } else if (templ->type == DATA_VARCHAR + || templ->type == DATA_VARMYSQL + || templ->type == DATA_BINARY) { + + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len(dest, len, + templ->mysql_length_bytes); + } + + /* Copy the actual data */ + ut_memcpy(dest, data, len); + + /* Pad with trailing spaces. We pad with spaces also the + unused end of a >= 5.0.3 true VARCHAR column, just in case + MySQL expects its contents to be deterministic. */ + + pad_ptr = dest + len; + + ut_ad(templ->mbminlen <= templ->mbmaxlen); + + /* We handle UCS2 charset strings differently. */ + if (templ->mbminlen == 2) { + /* A space char is two bytes, 0x0020 in UCS2 */ + + if (len & 1) { + /* A 0x20 has been stripped from the column. + Pad it back. */ + + if (pad_ptr < field_end) { + *pad_ptr = 0x20; + pad_ptr++; + } + } + + /* Pad the rest of the string with 0x0020 */ + + while (pad_ptr < field_end) { + *pad_ptr = 0x00; + pad_ptr++; + *pad_ptr = 0x20; + pad_ptr++; + } + } else { + ut_ad(templ->mbminlen == 1); + /* space=0x20 */ + + memset(pad_ptr, 0x20, field_end - pad_ptr); + } + } else if (templ->type == DATA_BLOB) { + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, templ->mysql_col_len, data, + len); + } else if (templ->type == DATA_MYSQL) { + memcpy(dest, data, len); + + ut_a(templ->mysql_col_len >= len); + ut_a(templ->mbmaxlen >= templ->mbminlen); + + ut_a(templ->mbmaxlen > templ->mbminlen + || templ->mysql_col_len == len); + /* The following assertion would fail for old tables + containing UTF-8 ENUM columns due to Bug #9526. */ + ut_ad(!templ->mbmaxlen + || !(templ->mysql_col_len % templ->mbmaxlen)); + ut_a(len * templ->mbmaxlen >= templ->mysql_col_len); + + if (templ->mbminlen != templ->mbmaxlen) { + /* Pad with spaces. This undoes the stripping + done in row0mysql.ic, function + row_mysql_store_col_in_innobase_format(). */ + + memset(dest + len, 0x20, templ->mysql_col_len - len); + } + } else { + ut_a(templ->type == DATA_CHAR + || templ->type == DATA_FIXBINARY + /*|| templ->type == DATA_SYS_CHILD + || templ->type == DATA_SYS*/ + || templ->type == DATA_FLOAT + || templ->type == DATA_DOUBLE + || templ->type == DATA_DECIMAL); + ut_ad(templ->mysql_col_len == len); + + memcpy(dest, data, len); + } +} + +/****************************************************************** +Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. */ +static +ibool +row_sel_store_mysql_rec( +/*====================*/ + /* out: TRUE if success, FALSE if + could not allocate memory for a BLOB + (though we may also assert in that + case) */ + byte* mysql_rec, /* out: row in the MySQL format */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec, /* in: Innobase record in the index + which was described in prebuilt's + template */ + const ulint* offsets) /* in: array returned by + rec_get_offsets() */ +{ + mysql_row_templ_t* templ; + mem_heap_t* extern_field_heap = NULL; + byte* data; + ulint len; + byte* blob_buf; + int pad_char; + ulint i; + + ut_ad(prebuilt->mysql_template); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (prebuilt->blob_heap != NULL) { + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + } + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + data = rec_get_nth_field(rec, offsets, + templ->rec_field_no, &len); + + if (rec_offs_nth_extern(offsets, templ->rec_field_no)) { + + /* Copy an externally stored field to the temporary + heap */ + + ut_a(!prebuilt->trx->has_search_latch); + + extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE); + + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + + data = btr_rec_copy_externally_stored_field(rec, + offsets, templ->rec_field_no, &len, + extern_field_heap); + + ut_a(len != UNIV_SQL_NULL); + } + + if (len != UNIV_SQL_NULL) { + if (templ->type == DATA_BLOB) { + + ut_a(prebuilt->templ_contains_blob); + + /* A heuristic test that we can allocate the + memory for a big BLOB. We have a safety margin + of 1000000 bytes. Since the test takes some + CPU time, we do not use it for small BLOBs. */ + + if (len > 2000000 + && !ut_test_malloc(len + 1000000)) { + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: could not allocate %lu + 1000000 bytes to retrieve\n" +"InnoDB: a big column. Table name ", (ulong) len); + ut_print_name(stderr, + prebuilt->trx, + prebuilt->table->name); + putc('\n', stderr); + + if (extern_field_heap) { + mem_heap_free( + extern_field_heap); + } + return(FALSE); + } + + /* Copy the BLOB data to the BLOB heap of + prebuilt */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = + mem_heap_create(len); + } + + blob_buf = mem_heap_alloc(prebuilt->blob_heap, + len); + ut_memcpy(blob_buf, data, len); + + data = blob_buf; + } + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, data, len); + + /* Cleanup */ + if (extern_field_heap) { + mem_heap_free(extern_field_heap); + extern_field_heap = NULL; + } + + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ + mysql_rec[templ->mysql_null_byte_offset] &= + ~(byte) (templ->mysql_null_bit_mask); + } + } else { + /* MySQL seems to assume the field for an SQL NULL + value is set to zero or space. Not taking this into + account caused seg faults with NULL BLOB fields, and + bug number 154 in the MySQL bug database: GROUP BY + and DISTINCT could treat NULL values inequal. */ + + mysql_rec[templ->mysql_null_byte_offset] |= + (byte) (templ->mysql_null_bit_mask); + if (templ->type == DATA_VARCHAR + || templ->type == DATA_CHAR + || templ->type == DATA_BINARY + || templ->type == DATA_FIXBINARY + || templ->type == DATA_MYSQL + || templ->type == DATA_VARMYSQL) { + /* MySQL pads all non-BLOB and non-TEXT + string types with space ' ' */ + + pad_char = ' '; + } else { + pad_char = '\0'; + } + + /* Handle UCS2 strings differently. */ + if (pad_char != '\0' && templ->mbminlen == 2) { + /* There are two bytes per char, so the length + has to be an even number. */ + ut_a(!(templ->mysql_col_len & 1)); + data = mysql_rec + templ->mysql_col_offset; + len = templ->mysql_col_len; + /* Pad with 0x0020. */ + while (len >= 2) { + *data++ = 0x00; + *data++ = 0x20; + len -= 2; + } + } else { + ut_ad(!pad_char || templ->mbminlen == 1); + memset(mysql_rec + templ->mysql_col_offset, + pad_char, templ->mysql_col_len); + } + } + } + + return(TRUE); +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + dict_index_t* clust_index, /* in: clustered index */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_consistent_read(rec, mtr, clust_index, + offsets, read_view, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. */ +static +ulint +row_sel_get_clust_rec_for_mysql( +/*============================*/ + /* out: DB_SUCCESS or error code */ + row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */ + dict_index_t* sec_index,/* in: secondary index where rec resides */ + rec_t* rec, /* in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + ulint** offsets,/* out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + trx_t* trx; + + *out_rec = NULL; + trx = thr_get_trx(thr); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec, trx); + + clust_index = dict_table_get_first_index(sec_index->table); + + btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + prebuilt->clust_pcur->trx_if_known = trx; + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + if (!rec_get_deleted_flag(rec, sec_index->table->comp) + || prebuilt->select_lock_type != LOCK_NONE) { + ut_print_timestamp(stderr); + fputs(" InnoDB: error clustered record" + " for sec rec not found\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, sec_index); + fputs("\n" + "InnoDB: sec index record ", stderr); + rec_print(stderr, rec, sec_index); + fputs("\n" + "InnoDB: clust index record ", stderr); + rec_print(stderr, clust_rec, clust_index); + putc('\n', stderr); + trx_print(stderr, trx); + + fputs("\n" +"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); + } + + clust_rec = NULL; + + goto func_exit; + } + + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + ULINT_UNDEFINED, offset_heap); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ + + err = lock_clust_rec_read_check_and_lock(0, clust_rec, + clust_index, *offsets, + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, thr); + if (err != DB_SUCCESS) { + + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + /* If the isolation level allows reading of uncommitted data, + then we never look for an earlier version */ + + if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && !lock_clust_rec_cons_read_sees(clust_rec, clust_index, + *offsets, trx->read_view)) { + + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, clust_rec, + offsets, offset_heap, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if (clust_rec && (old_vers + || rec_get_deleted_flag(rec, sec_index->table->comp)) + && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index, + clust_rec, clust_index)) { + clust_rec = NULL; + } else { +#ifdef UNIV_SEARCH_DEBUG + ut_a(clust_rec == NULL || + row_sel_sec_rec_is_for_clust_rec(rec, sec_index, + clust_rec, clust_index)); +#endif + } + } + +func_exit: + *out_rec = clust_rec; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We may use the cursor in update: store its position */ + + btr_pcur_store_position(prebuilt->clust_pcur, mtr); + } + + err = DB_SUCCESS; +err_exit: + return(err); +} + +/************************************************************************ +Restores cursor position after it has been stored. We have to take into +account that the record cursor was positioned on may have been deleted. +Then we may have to move the cursor one step up or down. */ +static +ibool +sel_restore_position_for_mysql( +/*===========================*/ + /* out: TRUE if we may need to + process the record the cursor is + now positioned on (i.e. we should + not go to the next record yet) */ + ulint latch_mode, /* in: latch mode wished in + restoration */ + btr_pcur_t* pcur, /* in: cursor whose position + has been stored */ + ibool moves_up, /* in: TRUE if the cursor moves up + in the index */ + mtr_t* mtr) /* in: mtr; CAUTION: may commit + mtr temporarily! */ +{ + ibool success; + ulint relative_position; + + relative_position = pcur->rel_pos; + + success = btr_pcur_restore_position(latch_mode, pcur, mtr); + + if (relative_position == BTR_PCUR_ON) { + if (success) { + return(FALSE); + } + + if (moves_up) { + btr_pcur_move_to_next(pcur, mtr); + } + + return(TRUE); + } + + if (relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) { + + if (moves_up) { + return(TRUE); + } + + if (btr_pcur_is_on_user_rec(pcur, mtr)) { + btr_pcur_move_to_prev(pcur, mtr); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE); + + if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) { + btr_pcur_move_to_next(pcur, mtr); + } + + return(TRUE); +} + +/************************************************************************ +Pops a cached row for MySQL from the fetch cache. */ +UNIV_INLINE +void +row_sel_pop_cached_row_for_mysql( +/*=============================*/ + byte* buf, /* in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct */ +{ + ulint i; + mysql_row_templ_t* templ; + byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + + if (prebuilt->keep_other_fields_on_keyread) + { + /* Copy cache record field by field, don't touch fields that + are not covered by current key */ + cached_rec = + prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + ut_memcpy( + buf + templ->mysql_col_offset, + cached_rec + templ->mysql_col_offset, + templ->mysql_col_len); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) + { + buf[templ->mysql_null_byte_offset] ^= + (buf[templ->mysql_null_byte_offset] ^ + cached_rec[templ->mysql_null_byte_offset]) & + (byte)templ->mysql_null_bit_mask; + } + } + } + else + { + ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_row_len); + } + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/************************************************************************ +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_push_cache_row_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec, /* in: record to push */ + const ulint* offsets) /* in: rec_get_offsets() */ +{ + byte* buf; + ulint i; + + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_a(!prebuilt->templ_contains_blob); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + + /* A user has reported memory corruption in these + buffers in Linux. Put magic numbers there to help + to track a possible bug. */ + + buf = mem_alloc(prebuilt->mysql_row_len + 8); + + prebuilt->fetch_cache[i] = buf + 4; + + mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N); + mach_write_to_4(buf + 4 + prebuilt->mysql_row_len, + ROW_PREBUILT_FETCH_MAGIC_N); + } + } + + ut_ad(prebuilt->fetch_cache_first == 0); + + if (!row_sel_store_mysql_rec( + prebuilt->fetch_cache[prebuilt->n_fetch_cached], + prebuilt, rec, offsets)) { + ut_error; + } + + prebuilt->n_fetch_cached++; +} + +/************************************************************************* +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). We assume that the search +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, +btr search latch has been locked in S-mode. */ +static +ulint +row_sel_try_search_shortcut_for_mysql( +/*==================================*/ + /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ + rec_t** out_rec,/* out: record if found */ + row_prebuilt_t* prebuilt,/* in: prebuilt struct */ + ulint** offsets,/* in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ + mtr_t* mtr) /* in: started mtr */ +{ + dict_index_t* index = prebuilt->index; + dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + rec_t* rec; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(!prebuilt->templ_contains_blob); + + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, +#ifndef UNIV_SEARCH_DEBUG + RW_S_LATCH, +#else + 0, +#endif + mtr); + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { + + return(SEL_RETRY); + } + + if (rec_get_deleted_flag(rec, index->table->comp)) { + + return(SEL_EXHAUSTED); + } + + *out_rec = rec; + + return(SEL_FOUND); +} + +/************************************************************************ +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! */ + +ulint +row_search_for_mysql( +/*=================*/ + /* out: DB_SUCCESS, + DB_RECORD_NOT_FOUND, + DB_END_OF_INDEX, DB_DEADLOCK, + DB_LOCK_TABLE_FULL, DB_CORRUPTION, + or DB_TOO_BIG_RECORD */ + byte* buf, /* in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /* in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /* in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /* in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. */ +{ + dict_index_t* index = prebuilt->index; + dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + rec_t* rec; + rec_t* index_rec; + rec_t* clust_rec; + rec_t* old_vers; + ulint err = DB_SUCCESS; + ibool moved; + ibool cons_read_requires_clust_rec; + ibool was_lock_wait; + ulint shortcut; + ibool unique_search = FALSE; + ibool unique_search_from_clust_index = FALSE; + ibool mtr_has_extra_clust_latch = FALSE; + ibool moves_up = FALSE; + ibool set_also_gap_locks = TRUE; + /* if the query is a plain + locking SELECT, and the isolation + level is <= TRX_ISO_READ_COMMITTED, + then this is set to FALSE */ + ibool success; + ibool comp; + ulint cnt = 0; + ulint next_offs; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(index && pcur && search_tuple); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" +"InnoDB: MySQL is trying to use a table handle but the .ibd file for\n" +"InnoDB: table %s does not exist.\n" +"InnoDB: Have you deleted the .ibd file from the database directory under\n" +"InnoDB: the MySQL datadir, or have you used DISCARD TABLESPACE?\n" +"InnoDB: Look from\n" +"http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" +"InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption((byte*)prebuilt); + + ut_error; + } + + if (trx->n_mysql_tables_in_use == 0 + && prebuilt->select_lock_type == LOCK_NONE) { + /* Note that if MySQL uses an InnoDB temp table that it + created inside LOCK TABLES, then n_mysql_tables_in_use can + be zero; in that case select_lock_type is set to LOCK_X in + ::start_stmt. */ + + fputs( +"InnoDB: Error: MySQL is trying to perform a SELECT\n" +"InnoDB: but it has not locked any tables in ::external_lock()!\n", + stderr); + trx_print(stderr, trx); + fputc('\n', stderr); + } + +/* fprintf(stderr, "Match mode %lu\n search tuple ", (ulong) match_mode); + dtuple_print(search_tuple); + + fprintf(stderr, "N tables locked %lu\n", trx->mysql_n_tables_locked); +*/ + /*-------------------------------------------------------------*/ + /* PHASE 0: Release a possible s-latch we are holding on the + adaptive hash index latch if there is someone waiting behind */ + + if (trx->has_search_latch + && btr_search_latch.writer != RW_LOCK_NOT_LOCKED) { + + /* There is an x-latch request on the adaptive hash index: + release the s-latch to reduce starvation and wait for + BTR_SEA_TIMEOUT rounds before trying to keep it again over + calls from MySQL */ + + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + } + + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + + if (direction == 0) { + trx->op_info = "starting index read"; + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + trx->op_info = "fetching rows"; + + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (direction != prebuilt->fetch_direction) { + if (prebuilt->n_fetch_cached > 0) { + ut_error; + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! */ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + + srv_n_rows_read++; + err = DB_SUCCESS; + goto func_exit; + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { + + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete- + marked matching record. + + Note that in a unique secondary index there may be different delete- + marked versions of a record where only the primary key values differ: + thus in a secondary index we must use next-key locks when locking + delete-marked records. */ + + if (match_mode == ROW_SEL_EXACT + && index->type & DICT_UNIQUE + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index) + && (index->type & DICT_CLUSTERED + || !dtuple_contains_null(search_tuple))) { + + /* Note above that a UNIQUE secondary index can contain many + rows with the same key value if one of the columns is the SQL + null. A clustered index under MySQL can never contain null + columns because we demand that all the columns in primary key + are non-null. */ + + unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. */ + + if (direction != 0 && !prebuilt->used_in_HANDLER) { + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + } + + mtr_start(&mtr); + + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (unique_search + && index->type & DICT_CLUSTERED + && direction == 0 + && !prebuilt->templ_contains_blob + && !prebuilt->used_in_HANDLER + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { + + mode = PAGE_CUR_GE; + + unique_search_from_clust_index = TRUE; + + if (trx->mysql_n_tables_locked == 0 + && prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view) { + + /* This is a SELECT query done as a consistent read, + and the read view has already been allocated: + let us try a search shortcut through the hash + index. + NOTE that we must also test that + mysql_n_tables_locked == 0, because this might + also be INSERT INTO ... SELECT ... or + CREATE TABLE ... SELECT ... . Our algorithm is + NOT prepared to inserts interleaved with the SELECT, + and if we try that, we can deadlock on the adaptive + hash index semaphore! */ + +#ifndef UNIV_SEARCH_DEBUG + if (!trx->has_search_latch) { + rw_lock_s_lock(&btr_search_latch); + trx->has_search_latch = TRUE; + } +#endif + shortcut = row_sel_try_search_shortcut_for_mysql(&rec, + prebuilt, &offsets, &heap, &mtr); + if (shortcut == SEL_FOUND) { +#ifdef UNIV_SEARCH_DEBUG + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); +#endif + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec, offsets)) { + err = DB_TOO_BIG_RECORD; + + /* We let the main loop to do the + error handling */ + goto shortcut_fails_too_big_rec; + } + + mtr_commit(&mtr); + + /* ut_print_name(stderr, index->name); + fputs(" shortcut\n", stderr); */ + + srv_n_rows_read++; + + if (trx->search_latch_timeout > 0 + && trx->has_search_latch) { + + trx->search_latch_timeout--; + + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + } + + /* NOTE that we do NOT store the cursor + position */ + err = DB_SUCCESS; + goto func_exit; + + } else if (shortcut == SEL_EXHAUSTED) { + + mtr_commit(&mtr); + + /* ut_print_name(stderr, index->name); + fputs(" record not found 2\n", stderr); */ + + if (trx->search_latch_timeout > 0 + && trx->has_search_latch) { + + trx->search_latch_timeout--; + + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + } + + /* NOTE that we do NOT store the cursor + position */ + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } +shortcut_fails_too_big_rec: + mtr_commit(&mtr); + mtr_start(&mtr); + } + } + + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + + if (trx->has_search_latch) { + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + } + + trx_start_if_not_started(trx); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE + && trx->mysql_query_str) { + + /* Scan the MySQL query string; check if SELECT is the first + word there */ + + dict_accept(*trx->mysql_query_str, "SELECT", &success); + + if (success) { + /* It is a plain locking SELECT and the isolation + level is low: do not lock gaps */ + + set_also_gap_locks = FALSE; + } + } + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (direction == 0) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) { + moves_up = TRUE; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = TRUE; + } + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + clust_index = dict_table_get_first_index(index->table); + + if (direction != 0) { + moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + if (!moved) { + goto next_rec; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + + btr_pcur_open_with_no_init(index, search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, 0, &mtr); + + pcur->trx_if_known = trx; + } else { + if (mode == PAGE_CUR_G) { + btr_pcur_open_at_index_side(TRUE, index, + BTR_SEARCH_LEAF, pcur, FALSE, &mtr); + } else if (mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side(FALSE, index, + BTR_SEARCH_LEAF, pcur, FALSE, &mtr); + } + } + + if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + + if (trx->read_view == NULL + && prebuilt->select_lock_type == LOCK_NONE) { + + fputs( +"InnoDB: Error: MySQL is trying to perform a consistent read\n" +"InnoDB: but the read view is not assigned!\n", stderr); + trx_print(stderr, trx); + fputc('\n', stderr); + ut_a(0); + } + } else if (prebuilt->select_lock_type == LOCK_NONE) { + /* This is a consistent read */ + /* Assign a read view for the query */ + + trx_assign_read_view(trx); + prebuilt->sql_stat_start = FALSE; + } else { + if (prebuilt->select_lock_type == LOCK_S) { + err = lock_table(0, index->table, LOCK_IS, thr); + } else { + err = lock_table(0, index->table, LOCK_IX, thr); + } + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + prebuilt->sql_stat_start = FALSE; + } + +rec_loop: + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + + rec = btr_pcur_get_rec(pcur); + comp = index->table->comp; + ut_ad(comp == page_is_comp(buf_frame_align(rec))); +/* + fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt, + buf_frame_get_page_no(buf_frame_align(rec))); + rec_print(rec); +*/ + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used, + we do not lock gaps. Supremum record is really + a gap and therefore we do not set locks there. */ + + if (!srv_locks_unsafe_for_binlog) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + } + /* A page supremum record cannot be in the result set: skip + it now that we have placed a possible lock on it */ + + goto next_rec; + } + + /*-------------------------------------------------------------*/ + /* Do sanity checks in case our cursor has bumped into page + corruption */ + + next_offs = rec_get_next_offs(rec, comp); + + if (next_offs >= UNIV_PAGE_SIZE + || next_offs < + (ulint) (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)) { + + if (srv_force_recovery == 0 || moves_up == FALSE) { + ut_print_timestamp(stderr); + buf_page_print(buf_frame_align(rec)); + fprintf(stderr, +"\nInnoDB: rec address %p, first buffer frame %p\n" +"InnoDB: buffer pool high end %p, buf block fix count %lu\n", + rec, buf_pool->frame_zero, + buf_pool->high_end, + (ulong)buf_block_align(rec)->buf_fix_count); + fprintf(stderr, +"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" +"InnoDB: ", + (ulong) (rec - buf_frame_align(rec)), + (ulong) next_offs, + (ulong) buf_frame_get_page_no(rec)); + dict_index_name_print(stderr, trx, index); + fputs(". Run CHECK TABLE. You may need to\n" +"InnoDB: restore from a backup, or dump + drop + reimport the table.\n", + stderr); + + err = DB_CORRUPTION; + + goto lock_wait_or_error; + } else { + /* The user may be dumping a corrupt table. Jump + over the corruption to recover as much as possible. */ + + fprintf(stderr, +"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" +"InnoDB: ", + (ulong) (rec - buf_frame_align(rec)), + (ulong) next_offs, + (ulong) buf_frame_get_page_no(rec)); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the rest of the page.\n", + stderr); + + btr_pcur_move_to_last_on_page(pcur, &mtr); + + goto next_rec; + } + } + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (srv_force_recovery > 0) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { + fprintf(stderr, +"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" +"InnoDB: ", + (ulong) (rec - buf_frame_align(rec)), + (ulong) next_offs, + (ulong) buf_frame_get_page_no(rec)); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the record.\n", + stderr); + + goto next_rec; + } + } + + /*-------------------------------------------------------------*/ + + /* Note that we cannot trust the up_match value in the cursor at this + place because we can arrive here after moving the cursor! Thus + we have to recompare rec and search_tuple to determine if they + match enough. */ + + if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record matches completely to search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + /* fputs("Comparing rec and search tuple\n", stderr); */ + + if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) { + + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set */ + + if (srv_locks_unsafe_for_binlog == FALSE) { + + err = sel_set_rec_lock(rec, index, + offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + } + + btr_pcur_store_position(pcur, &mtr); + + err = DB_RECORD_NOT_FOUND; + /* ut_print_name(stderr, index->name); + fputs(" record not found 3\n", stderr); */ + + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) { + + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set */ + + if (srv_locks_unsafe_for_binlog == FALSE) { + + err = sel_set_rec_lock(rec, index, + offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + } + + btr_pcur_store_position(pcur, &mtr); + + err = DB_RECORD_NOT_FOUND; + /* ut_print_name(stderr, index->name); + fputs(" record not found 4\n", stderr); */ + + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + cons_read_requires_clust_rec = FALSE; + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + ulint lock_type; + + if (!set_also_gap_locks + || (unique_search && !rec_get_deleted_flag(rec, comp))) { + lock_type = LOCK_REC_NOT_GAP; + } else { + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + } + + /* If we are doing a 'greater or equal than a primary key + value' search from a clustered index, and we find a record + that has that exact primary key value, then there is no need + to lock the gap before the record, because no insert in the + gap can be in our search range. That is, no phantom row can + appear that way. + + An example: if col1 is the primary key, the search is WHERE + col1 >= 100, and we find a record where col1 = 100, then no + need to lock the gap before that record. */ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) { + + lock_type = LOCK_REC_NOT_GAP; + } + + err = sel_set_rec_lock(rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { + + /* Fetch a previous version of the row if the current + one is not visible in the snapshot; if we have a very + high force recovery level set, we try to avoid crashes + by skipping this lookup */ + + if (srv_force_recovery < 5 + && !lock_clust_rec_cons_read_sees(rec, index, + offsets, trx->read_view)) { + + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, rec, + &offsets, &heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, + trx->read_view)) { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. */ + + cons_read_requires_clust_rec = TRUE; + } + } + + if (rec_get_deleted_flag(rec, comp) + && !cons_read_requires_clust_rec) { + + /* The record is delete-marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + goto next_rec; + } + + /* Get the clustered index record if needed and if we did + not do the search using the clustered index */ + + index_rec = rec; + + /* Before and after the following "if" block, "offsets" will be + related to "rec", which may be in "index", a secondary index or + the clustered index ("clust_index"). However, after this "if" block, + "rec" may be pointing to "clust_rec" of "clust_index". */ + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (index != clust_index && (cons_read_requires_clust_rec + || prebuilt->need_to_access_clustered)) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_has_extra_clust_latch = TRUE; + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, + &offsets, &heap, &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, comp)) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (prebuilt->need_to_access_clustered) { + rec = clust_rec; + ut_ad(rec_offs_validate(rec, clust_index, offsets)); + } else { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + } + } + + /* We found a qualifying row */ + ut_ad(rec_offs_validate(rec, + rec == clust_rec ? clust_index : index, + offsets)); + + if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->templ_contains_blob + && !prebuilt->clust_index_was_generated + && !prebuilt->used_in_HANDLER + && prebuilt->template_type + != ROW_MYSQL_DUMMY_TEMPLATE) { + + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE. + Since we keep space in prebuilt only for the BLOBs of + a single row, we cannot cache rows in the case there + are BLOBs in the fields to be fetched. In HANDLER we do + not cache rows because there the cursor is a scrollable + cursor. */ + + row_sel_push_cache_row_for_mysql(prebuilt, rec, offsets); + + if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { + + goto got_row; + } + + goto next_rec; + } else { + if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) { + memcpy(buf + 4, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); + } else { + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec, offsets)) { + err = DB_TOO_BIG_RECORD; + + goto lock_wait_or_error; + } + } + + if (prebuilt->clust_index_was_generated) { + if (rec != index_rec) { + offsets = rec_get_offsets( + index_rec, index, offsets, + ULINT_UNDEFINED, &heap); + } + row_sel_store_row_id_to_prebuilt(prebuilt, index_rec, + index, offsets); + } + } +got_row: + /* We have an optimization to save CPU time: if this is a consistent + read on a unique condition on the clustered index, then we do not + store the pcur position, because any fetch next or prev will anyway + return 'end of file'. An exception is the MySQL HANDLER command + where the user can move the cursor with PREV or NEXT even after + a unique search. */ + + if (!unique_search_from_clust_index + || prebuilt->select_lock_type == LOCK_X + || prebuilt->used_in_HANDLER) { + + /* Inside an update always store the cursor position */ + + btr_pcur_store_position(pcur, &mtr); + } + + err = DB_SUCCESS; + + goto normal_return; + +next_rec: + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + + if (mtr_has_extra_clust_latch) { + /* We must commit mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. */ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + mtr_start(&mtr); + moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + if (moved) { + cnt++; + + goto rec_loop; + } + } + + if (moves_up) { + moved = btr_pcur_move_to_next(pcur, &mtr); + } else { + moved = btr_pcur_move_to_prev(pcur, &mtr); + } + + if (!moved) { + btr_pcur_store_position(pcur, &mtr); + + if (match_mode != 0) { + err = DB_RECORD_NOT_FOUND; + } else { + err = DB_END_OF_INDEX; + } + + goto normal_return; + } + + cnt++; + + goto rec_loop; + +lock_wait_or_error: + /*-------------------------------------------------------------*/ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + trx->error_state = err; + + /* The following is a patch for MySQL */ + + que_thr_stop_for_mysql(thr); + + thr->lock_state= QUE_THR_LOCK_ROW; + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + mtr_start(&mtr); + + sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + mode = pcur->search_mode; + + goto rec_loop; + } + +/* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ + goto func_exit; + +normal_return: + /*-------------------------------------------------------------*/ + que_thr_stop_for_mysql_no_error(thr, trx); + + mtr_commit(&mtr); + + if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + err = DB_SUCCESS; + } + +/* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ + if (err == DB_SUCCESS) { + srv_n_rows_read++; + } + +func_exit: + trx->op_info = ""; + if (heap) { + mem_heap_free(heap); + } + return(err); +} + +/*********************************************************************** +Checks if MySQL at the moment is allowed for this table to retrieve a +consistent read result, or store it to the query cache. */ + +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + /* out: TRUE if storing or retrieving + from the query cache is permitted */ + trx_t* trx, /* in: transaction object */ + const char* norm_name) /* in: concatenation of database name, + '/' char, table name */ +{ + dict_table_t* table; + ibool ret = FALSE; + + table = dict_table_get(norm_name, trx); + + if (table == NULL) { + + return(FALSE); + } + + mutex_enter(&kernel_mutex); + + /* Start the transaction if it is not started yet */ + + trx_start_if_not_started_low(trx); + + /* If there are locks on the table or some trx has invalidated the + cache up to our trx id, then ret = FALSE. + We do not check what type locks there are on the table, though only + IX type locks actually would require ret = FALSE. */ + + if (UT_LIST_GET_LEN(table->locks) == 0 + && ut_dulint_cmp(trx->id, table->query_cache_inv_trx_id) >= 0) { + + ret = TRUE; + + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !trx->read_view) { + + trx->read_view = read_view_open_now(trx, + trx->read_view_heap); + } + } + + mutex_exit(&kernel_mutex); + + return(ret); +} diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.c new file mode 100644 index 00000000000..9dc860d70b1 --- /dev/null +++ b/storage/innobase/row/row0uins.c @@ -0,0 +1,308 @@ +/****************************************************** +Fresh insert undo + +(c) 1996 Innobase Oy + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" + +#ifdef UNIV_NONINL +#include "row0uins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" + +/******************************************************************* +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. */ +static +ulint +row_undo_ins_remove_clust_rec( +/*==========================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node) /* in: undo node */ +{ + btr_cur_t* btr_cur; + ibool success; + ulint err; + ulint n_tries = 0; + mtr_t mtr; + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur), + &mtr); + ut_a(success); + + if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + + /* Drop the index tree associated with the row in + SYS_INDEXES table: */ + + dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr); + + mtr_commit(&mtr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, + &(node->pcur), &mtr); + ut_a(success); + } + + btr_cur = btr_pcur_get_btr_cur(&(node->pcur)); + + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (success) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_TREE, + &(node->pcur), &mtr); + ut_a(success); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + trx_undo_rec_release(node->trx, node->undo_no); + + return(err); +} + +/******************************************************************* +Removes a secondary index entry if found. */ +static +ulint +row_undo_ins_remove_sec_low( +/*========================*/ + /* out: DB_SUCCESS, DB_FAIL, or + DB_OUT_OF_FILE_SPACE */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry to remove */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool found; + ibool success; + ulint err; + mtr_t mtr; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/******************************************************************* +Removes a secondary index entry from the index if found. Tries first +optimistic, then pessimistic descent down the tree. */ +static +ulint +row_undo_ins_remove_sec( +/*====================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry to insert */ +{ + ulint err; + ulint n_tries = 0; + + /* Try first optimistic descent to the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ +retry: + err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/*************************************************************** +Parses the row reference and other info in a fresh insert undo record. */ +static +void +row_undo_ins_parse_undo_rec( +/*========================*/ + undo_node_t* node) /* in: row undo node */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + ulint type; + ulint dummy; + ibool dummy_extern; + + ut_ad(node); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, + &dummy_extern, &undo_no, &table_id); + ut_ad(type == TRX_UNDO_INSERT_REC); + node->rec_type = type; + + node->table = dict_table_get_on_id(table_id, node->trx); + + if (node->table == NULL) { + + return; + } + + if (node->table->ibd_file_missing) { + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); +} + +/*************************************************************** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. */ + +ulint +row_undo_ins( +/*=========*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node) /* in: row undo node */ +{ + dtuple_t* entry; + ibool found; + ulint err; + + ut_ad(node); + ut_ad(node->state == UNDO_NODE_INSERT); + + row_undo_ins_parse_undo_rec(node); + + if (node->table == NULL) { + found = FALSE; + } else { + found = row_undo_search_clust_to_pcur(node); + } + + if (!found) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + while (node->index != NULL) { + entry = row_build_index_entry(node->row, node->index, + node->heap); + err = row_undo_ins_remove_sec(node->index, entry); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + err = row_undo_ins_remove_clust_rec(node); + + return(err); +} diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c new file mode 100644 index 00000000000..1cade0f304f --- /dev/null +++ b/storage/innobase/row/row0umod.c @@ -0,0 +1,773 @@ +/****************************************************** +Undo modify of a row + +(c) 1997 Innobase Oy + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" + +#ifdef UNIV_NONINL +#include "row0umod.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. +(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/*************************************************************** +Checks if also the previous version of the clustered index record was +modified or inserted by the same transaction, and its undo number is such +that it should be undone in the same rollback. */ +UNIV_INLINE +ibool +row_undo_mod_undo_also_prev_vers( +/*=============================*/ + /* out: TRUE if also previous modify or + insert of this row should be undone */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dulint* undo_no)/* out: the undo number */ +{ + trx_undo_rec_t* undo_rec; + ibool ret; + trx_t* trx; + + UT_NOT_USED(thr); + + trx = node->trx; + + if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) { + + return(FALSE); + } + + undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap); + + *undo_no = trx_undo_rec_get_undo_no(undo_rec); + + if (ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0) { + ret = TRUE; + } else { + ret = FALSE; + } + + return(ret); +} + +/*************************************************************** +Undoes a modify in a clustered index record. */ +static +ulint +row_undo_mod_clust_low( +/*===================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code: + we may run out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr, /* in: mtr */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + big_rec_t* dummy_big_rec; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + ut_ad(success); + + if (mode == BTR_MODIFY_LEAF) { + + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, &dummy_big_rec, node->update, + node->cmpl_info, thr, mtr); + } + + return(err); +} + +/*************************************************************** +Removes a clustered index record after undo if possible. */ +static +ulint +row_undo_mod_remove_clust_low( +/*==========================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code: + we may run out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr __attribute__((unused)), /* in: query thread */ + mtr_t* mtr, /* in: mtr */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + if (!success) { + + return(DB_SUCCESS); + } + + /* Find out if we can remove the whole clustered index record */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { + + /* Ok, we can remove */ + } else { + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* Note that since this operation is analogous to purge, + we can free also inherited externally stored fields: + hence the last FALSE in the call below */ + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, FALSE, mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + + return(err); +} + +/*************************************************************** +Undoes a modify in a clustered index record. Sets also the node state for the +next round of undo. */ +static +ulint +row_undo_mod_clust( +/*===============*/ + /* out: DB_SUCCESS or error code: we may run + out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + ulint err; + ibool success; + ibool more_vers; + dulint new_undo_no; + + ut_ad(node && thr); + + /* Check if also the previous version of the clustered index record + should be undone in this same rollback operation */ + + more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no); + + pcur = &(node->pcur); + + mtr_start(&mtr); + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_LEAF); + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a + pessimistic descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + node->state = UNDO_NODE_FETCH_NEXT; + + trx_undo_rec_release(node->trx, node->undo_no); + + if (more_vers && err == DB_SUCCESS) { + + /* Reserve the undo log record to the prior version after + committing &mtr: this is necessary to comply with the latching + order, as &mtr may contain the fsp latch which is lower in + the latch hierarchy than trx->undo_mutex. */ + + success = trx_undo_rec_reserve(node->trx, new_undo_no); + + if (success) { + node->state = UNDO_NODE_PREV_VERS; + } + } + + return(err); +} + +/*************************************************************** +Delete marks or removes a secondary index entry if found. */ +static +ulint +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + /* out: DB_SUCCESS, DB_FAIL, or + DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode) /* in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + ibool found; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has; + ulint err; + mtr_t mtr; + mtr_t mtr_vers; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should delete mark the record. */ + + mtr_start(&mtr_vers); + + success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur), + &mtr_vers); + ut_a(success); + + old_has = row_vers_old_has_index_entry(FALSE, + btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + if (old_has) { + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, &mtr); + ut_ad(err == DB_SUCCESS); + } else { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + TRUE, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/*************************************************************** +Delete marks or removes a secondary index entry if found. +NOTE that if we updated the fields of a delete-marked secondary index record +so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot +return to the original values because we do not know them. But this should +not cause problems because in row0sel.c, in queries we always retrieve the +clustered index record or an earlier version of it, if the secondary index +record through which we do the search is delete-marked. */ +static +ulint +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + ulint err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_TREE); + return(err); +} + +/*************************************************************** +Delete unmarks a secondary index entry which must be found. It might not be +delete-marked at the moment, but it does not harm to unmark it anyway. We also +need to update the fields of the secondary index record if we updated its +fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. */ +static +ulint +row_undo_mod_del_unmark_sec_and_undo_update( +/*========================================*/ + /* out: DB_FAIL or DB_SUCCESS or + DB_OUT_OF_FILE_SPACE */ + ulint mode, /* in: search mode: BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + upd_t* update; + ulint err = DB_SUCCESS; + ibool found; + big_rec_t* dummy_big_rec; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + if (!found) { + fputs("InnoDB: error in sec index entry del undo in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); + putc('\n', stderr); + trx_print(stderr, trx); + fputs("\n" +"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); + } else { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); + ut_a(err == DB_SUCCESS); + heap = mem_heap_create(100); + + update = row_upd_build_sec_rec_difference_binary(index, entry, + btr_cur_get_rec(btr_cur), trx, heap); + if (upd_get_n_fields(update) == 0) { + + /* Do nothing */ + + } else if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping + changes within the page */ + + err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG + | BTR_NO_LOCKING_FLAG, + btr_cur, update, 0, thr, &mtr); + if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG + | BTR_NO_LOCKING_FLAG, + btr_cur, &dummy_big_rec, + update, 0, thr, &mtr); + } + + mem_heap_free(heap); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. */ +static +ulint +row_undo_mod_upd_del_sec( +/*=====================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, index, heap); + + err = row_undo_mod_del_mark_or_remove_sec(node, thr, index, + entry); + if (err != DB_SUCCESS) { + + mem_heap_free(heap); + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. */ +static +ulint +row_undo_mod_del_mark_sec( +/*======================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, index, heap); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, + thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, + thr, index, entry); + } + + if (err != DB_SUCCESS) { + + mem_heap_free(heap); + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. */ +static +ulint +row_undo_mod_upd_exist_sec( +/*=======================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + /* No change in secondary indexes */ + + return(DB_SUCCESS); + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field_binary(node->row, node->index, + node->update)) { + + /* Build the newest version of the index entry */ + entry = row_build_index_entry(node->row, index, heap); + + /* NOTE that if we updated the fields of a + delete-marked secondary index record so that + alphabetically they stayed the same, e.g., + 'abc' -> 'aBc', we cannot return to the original + values because we do not know them. But this should + not cause problems because in row0sel.c, in queries + we always retrieve the clustered index record or an + earlier version of it, if the secondary index record + through which we do the search is delete-marked. */ + + err = row_undo_mod_del_mark_or_remove_sec(node, thr, + index, entry); + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row. We also need to update the fields of + the secondary index record if we updated its fields + but alphabetically they stayed the same, e.g., + 'abc' -> 'aBc'. */ + + row_upd_index_replace_new_col_vals(entry, index, + node->update, NULL); + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, + thr, index, entry); + if (err == DB_FAIL) { + err = + row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, + thr, index, entry); + } + + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Parses the row reference and other info in a modify undo log record. */ +static +void +row_undo_mod_parse_undo_rec( +/*========================*/ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + ibool dummy_extern; + trx_t* trx; + + ut_ad(node && thr); + trx = thr_get_trx(thr); + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + node->rec_type = type; + + node->table = dict_table_get_on_id(table_id, trx); + + /* TODO: other fixes associated with DROP TABLE + rollback in the + same table by another user */ + + if (node->table == NULL) { + /* Table was dropped */ + return; + } + + if (node->table->ibd_file_missing) { + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + node->new_roll_ptr = roll_ptr; + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; +} + +/*************************************************************** +Undoes a modify operation on a row of a table. */ + +ulint +row_undo_mod( +/*=========*/ + /* out: DB_SUCCESS or error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + ibool found; + ulint err; + + ut_ad(node && thr); + ut_ad(node->state == UNDO_NODE_MODIFY); + + row_undo_mod_parse_undo_rec(node, thr); + + if (node->table == NULL) { + found = FALSE; + } else { + found = row_undo_search_clust_to_pcur(node); + } + + if (!found) { + /* It is already undone, or will be undone by another query + thread, or table was dropped */ + + trx_undo_rec_release(node->trx, node->undo_no); + node->state = UNDO_NODE_FETCH_NEXT; + + return(DB_SUCCESS); + } + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + err = row_undo_mod_upd_exist_sec(node, thr); + + } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { + + err = row_undo_mod_del_mark_sec(node, thr); + } else { + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + err = row_undo_mod_upd_del_sec(node, thr); + } + + if (err != DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_clust(node, thr); + + return(err); +} diff --git a/storage/innobase/row/row0undo.c b/storage/innobase/row/row0undo.c new file mode 100644 index 00000000000..abe73cbe705 --- /dev/null +++ b/storage/innobase/row/row0undo.c @@ -0,0 +1,350 @@ +/****************************************************** +Row undo + +(c) 1997 Innobase Oy + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#include "row0undo.h" + +#ifdef UNIV_NONINL +#include "row0undo.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0uins.h" +#include "row0umod.h" +#include "row0mysql.h" +#include "srv0srv.h" + +/* How to undo row operations? +(1) For an insert, we have stored a prefix of the clustered index record +in the undo log. Using it, we look for the clustered record, and using +that we look for the records in the secondary indexes. The insert operation +may have been left incomplete, if the database crashed, for example. +We may have look at the trx id and roll ptr to make sure the record in the +clustered index is really the one for which the undo log record was +written. We can use the framework we get from the original insert op. +(2) Delete marking: We can use the framework we get from the original +delete mark op. We only have to check the trx id. +(3) Update: This may be the most complicated. We have to use the framework +we get from the original update op. + +What if the same trx repeatedly deletes and inserts an identical row. +Then the row id changes and also roll ptr. What if the row id was not +part of the ordering fields in the clustered index? Maybe we have to write +it to undo log. Well, maybe not, because if we order the row id and trx id +in descending order, then the only undeleted copy is the first in the +index. Our searches in row operations always position the cursor before +the first record in the result set. But, if there is no key defined for +a table, then it would be desirable that row id is in ascending order. +So, lets store row id in descending order only if it is not an ordering +field in the clustered index. + +NOTE: Deletes and inserts may lead to situation where there are identical +records in a secondary index. Is that a problem in the B-tree? Yes. +Also updates can lead to this, unless trx id and roll ptr are included in +ord fields. +(1) Fix in clustered indexes: include row id, trx id, and roll ptr +in node pointers of B-tree. +(2) Fix in secondary indexes: include all fields in node pointers, and +if an entry is inserted, check if it is equal to the right neighbor, +in which case update the right neighbor: the neighbor must be delete +marked, set it unmarked and write the trx id of the current transaction. + +What if the same trx repeatedly updates the same row, updating a secondary +index field or not? Updating a clustered index ordering field? + +(1) If it does not update the secondary index and not the clustered index +ord field. Then the secondary index record stays unchanged, but the +trx id in the secondary index record may be smaller than in the clustered +index record. This is no problem? +(2) If it updates secondary index ord field but not clustered: then in +secondary index there are delete marked records, which differ in an +ord field. No problem. +(3) Updates clustered ord field but not secondary, and secondary index +is unique. Then the record in secondary index is just updated at the +clustered ord field. +(4) + +Problem with duplicate records: +Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a +bigger trx id has inserted and delete marked a similar row, our trx inserts +again a similar row, and a trx with an even bigger id delete marks it. Then +the position of the row should change in the index if the trx id affects +the alphabetical ordering. + +Fix 2: If an insert encounters a similar row marked deleted, we turn the +insert into an 'update' of the row marked deleted. Then we must write undo +info on the update. A problem: what if a purge operation tries to remove +the delete marked row? + +We can think of the database row versions as a linked list which starts +from the record in the clustered index, and is linked by roll ptrs +through undo logs. The secondary index records are references which tell +what kinds of records can be found in this linked list for a record +in the clustered index. + +How to do the purge? A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/************************************************************************ +Creates a row undo node to a query graph. */ + +undo_node_t* +row_undo_node_create( +/*=================*/ + /* out, own: undo node */ + trx_t* trx, /* in: transaction */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx && parent && heap); + + undo = mem_heap_alloc(heap, sizeof(undo_node_t)); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->state = UNDO_NODE_FETCH_NEXT; + undo->trx = trx; + + btr_pcur_init(&(undo->pcur)); + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/*************************************************************** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. */ + +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + /* out: TRUE if found; NOTE the node->pcur + must be closed by the caller, regardless of + the return value */ + undo_node_t* node) /* in: row undo node */ +{ + dict_index_t* clust_index; + ibool found; + mtr_t mtr; + ibool ret; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + rec = btr_pcur_get_rec(&(node->pcur)); + + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, + row_get_rec_roll_ptr(rec, clust_index, offsets))) { + + /* We must remove the reservation on the undo log record + BEFORE releasing the latch on the clustered index page: this + is to make sure that some thread will eventually undo the + modification corresponding to node->roll_ptr. */ + + /* fputs("--------------------undoing a previous version\n", + stderr); */ + + ret = FALSE; + } else { + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + offsets, node->heap); + btr_pcur_store_position(&(node->pcur), &mtr); + + ret = TRUE; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (heap) { + mem_heap_free(heap); + } + return(ret); +} + +/*************************************************************** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. */ +static +ulint +row_undo( +/*=====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + trx_t* trx; + dulint roll_ptr; + ibool froze_data_dict = FALSE; + + ut_ad(node && thr); + + trx = node->trx; + + if (node->state == UNDO_NODE_FETCH_NEXT) { + + node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, + trx->roll_limit, + &roll_ptr, + node->heap); + if (!node->undo_rec) { + /* Rollback completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + + } else if (node->state == UNDO_NODE_PREV_VERS) { + + /* Undo should be done to the same clustered index record + again in this same rollback, restoring the previous version */ + + roll_ptr = node->new_roll_ptr; + + node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr, + node->heap); + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + } + + /* Prevent DROP TABLE etc. while we are rolling back this row. + If we are doing a TABLE CREATE or some other dictionary operation, + then we already have dict_operation_lock locked in x-mode. Do not + try to lock again in s-mode, because that would cause a hang. */ + + if (trx->dict_operation_lock_mode == 0) { + + row_mysql_freeze_data_dictionary(trx); + + froze_data_dict = TRUE; + } + + if (node->state == UNDO_NODE_INSERT) { + + err = row_undo_ins(node); + + node->state = UNDO_NODE_FETCH_NEXT; + } else { + ut_ad(node->state == UNDO_NODE_MODIFY); + err = row_undo_mod(node, thr); + } + + if (froze_data_dict) { + + row_mysql_unfreeze_data_dictionary(trx); + } + + /* Do some cleanup */ + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/*************************************************************** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. */ + +que_thr_t* +row_undo_step( +/*==========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + undo_node_t* node; + trx_t* trx; + + ut_ad(thr); + + srv_activity_count++; + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + err = row_undo(node, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* SQL error detected */ + + fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", + (ulong) err); + + if (err == DB_OUT_OF_FILE_SPACE) { + fprintf(stderr, + "InnoDB: Error 13 means out of tablespace.\n" + "InnoDB: Consider increasing your tablespace.\n"); + + exit(1); + } + + ut_error; + + return(NULL); + } + + return(thr); +} diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c new file mode 100644 index 00000000000..3305724a89b --- /dev/null +++ b/storage/innobase/row/row0upd.c @@ -0,0 +1,2035 @@ +/****************************************************** +Update of a row + +(c) 1996 Innobase Oy + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "row0upd.h" + +#ifdef UNIV_NONINL +#include "row0upd.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "mach0data.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0sel.h" +#include "row0row.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0sym.h" +#include "eval0eval.h" + + +/* What kind of latch and lock can we assume when the control comes to + ------------------------------------------------------------------- +an update node? +-------------- +Efficiency of massive updates would require keeping an x-latch on a +clustered index page through many updates, and not setting an explicit +x-lock on clustered index records, as they anyway will get an implicit +x-lock when they are updated. A problem is that the read nodes in the +graph should know that they must keep the latch when passing the control +up to the update node, and not set any record lock on the record which +will be updated. Another problem occurs if the execution is stopped, +as the kernel switches to another query thread, or the transaction must +wait for a lock. Then we should be able to release the latch and, maybe, +acquire an explicit x-lock on the record. + Because this seems too complicated, we conclude that the less +efficient solution of releasing all the latches when the control is +transferred to another node, and acquiring explicit x-locks, is better. */ + +/* How is a delete performed? If there is a delete without an +explicit cursor, i.e., a searched delete, there are at least +two different situations: +the implicit select cursor may run on (1) the clustered index or +on (2) a secondary index. The delete is performed by setting +the delete bit in the record and substituting the id of the +deleting transaction for the original trx id, and substituting a +new roll ptr for previous roll ptr. The old trx id and roll ptr +are saved in the undo log record. Thus, no physical changes occur +in the index tree structure at the time of the delete. Only +when the undo log is purged, the index records will be physically +deleted from the index trees. + +The query graph executing a searched delete would consist of +a delete node which has as a subtree a select subgraph. +The select subgraph should return a (persistent) cursor +in the clustered index, placed on page which is x-latched. +The delete node should look for all secondary index records for +this clustered index entry and mark them as deleted. When is +the x-latch freed? The most efficient way for performing a +searched delete is obviously to keep the x-latch for several +steps of query graph execution. */ + +/*************************************************************** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + /* out: TRUE if changes */ + dtuple_t* entry, /* in: old value of index entry */ + dict_index_t* index, /* in: index of entry */ + upd_t* update, /* in: update vector for the row */ + ulint n); /* in: how many first fields to check */ + + +/************************************************************************* +Checks if index currently is mentioned as a referenced index in a foreign +key constraint. */ +static +ibool +row_upd_index_is_referenced( +/*========================*/ + /* out: TRUE if referenced; NOTE that since + we do not hold dict_operation_lock + when leaving the function, it may be that + the referencing table has been dropped when + we leave this function: this function is only + for heuristic use! */ + dict_index_t* index, /* in: index */ + trx_t* trx) /* in: transaction */ +{ + dict_table_t* table = index->table; + dict_foreign_t* foreign; + ibool froze_data_dict = FALSE; + + if (!UT_LIST_GET_FIRST(table->referenced_list)) { + + return(FALSE); + } + + if (trx->dict_operation_lock_mode == 0) { + row_mysql_freeze_data_dictionary(trx); + froze_data_dict = TRUE; + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + if (foreign->referenced_index == index) { + + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(TRUE); + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(FALSE); +} + +/************************************************************************* +Checks if possible foreign key constraints hold after a delete of the record +under pcur. NOTE that this function will temporarily commit mtr and lose the +pcur position! */ +static +ulint +row_upd_check_references_constraints( +/*=================================*/ + /* out: DB_SUCCESS or an error code */ + upd_node_t* node, /* in: row update node */ + btr_pcur_t* pcur, /* in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /* in: table in question */ + dict_index_t* index, /* in: index of the cursor */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + trx_t* trx; + rec_t* rec; + ulint err; + ibool got_s_lock = FALSE; + + if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) { + + return(DB_SUCCESS); + } + + trx = thr_get_trx(thr); + + rec = btr_pcur_get_rec(pcur); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + if (trx->dict_operation_lock_mode == 0) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. */ + + if (foreign->referenced_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary(entry, index, + node->update, foreign->n_fields))) { + + if (foreign->foreign_table == NULL) { + dict_table_get(foreign->foreign_table_name, + trx); + } + + if (foreign->foreign_table) { + mutex_enter(&(dict_sys->mutex)); + + (foreign->foreign_table + ->n_foreign_key_checks_running)++; + + mutex_exit(&(dict_sys->mutex)); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects 'foreign' from + being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint(FALSE, foreign, + table, entry, thr); + + if (foreign->foreign_table) { + mutex_enter(&(dict_sys->mutex)); + + ut_a(foreign->foreign_table + ->n_foreign_key_checks_running > 0); + + (foreign->foreign_table + ->n_foreign_key_checks_running)--; + + mutex_exit(&(dict_sys->mutex)); + } + + if (err != DB_SUCCESS) { + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary( + trx); + } + + mem_heap_free(heap); + + return(err); + } + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/************************************************************************* +Creates an update node for a query graph. */ + +upd_node_t* +upd_node_create( +/*============*/ + /* out, own: update node */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + upd_node_t* node; + + node = mem_heap_alloc(heap, sizeof(upd_node_t)); + node->common.type = QUE_NODE_UPDATE; + + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->select_will_do_update = FALSE; + node->in_mysql_interface = FALSE; + + node->row = NULL; + node->ext_vec = NULL; + node->index = NULL; + node->update = NULL; + + node->foreign = NULL; + node->cascade_heap = NULL; + node->cascade_node = NULL; + + node->select = NULL; + + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + node->cmpl_info = 0; + + return(node); +} + +/************************************************************************* +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ + +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ +{ + byte* field; + ulint len; + + field = rec_get_nth_field(rec, offsets, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + trx_write_trx_id(field, trx_id); + + field = rec_get_nth_field(rec, offsets, pos + 1, &len); + ut_ad(len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(field, roll_ptr); +} + +/************************************************************************* +Sets the trx id or roll ptr field of a clustered index entry. */ + +void +row_upd_index_entry_sys_field( +/*==========================*/ + dtuple_t* entry, /* in: index entry, where the memory buffers + for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /* in: clustered index */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + dulint val) /* in: value to write */ +{ + dfield_t* dfield; + byte* field; + ulint pos; + + ut_ad(index->type & DICT_CLUSTERED); + + pos = dict_index_get_sys_col_pos(index, type); + + dfield = dtuple_get_nth_field(entry, pos); + field = dfield_get_data(dfield); + + if (type == DATA_TRX_ID) { + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + trx_write_roll_ptr(field, val); + } +} + +/*************************************************************** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. */ + +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + /* out: TRUE if the update changes the size of + some field in index or the field is external + in rec or update */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + upd_t* update) /* in: update vector */ +{ + upd_field_t* upd_field; + dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + new_len = new_val->len; + + if (new_len == UNIV_SQL_NULL && !rec_offs_comp(offsets)) { + /* A bug fixed on Dec 31st, 2004: we looked at the + SQL NULL size from the wrong field! We may backport + this fix also to 4.0. The merge to 5.0 will be made + manually immediately after we commit this to 4.1. */ + + new_len = dtype_get_sql_null_size( + dict_index_get_nth_type(index, + upd_field->field_no)); + } + + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + + if (old_len != new_len) { + + return(TRUE); + } + + if (rec_offs_nth_extern(offsets, upd_field->field_no)) { + + return(TRUE); + } + + if (upd_field->extern_storage) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the record +given. No field size changes are allowed. This function is used only for +a clustered index */ + +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update) /* in: update vector */ +{ + upd_field_t* upd_field; + dfield_t* new_val; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits); + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + rec_set_nth_field(rec, offsets, upd_field->field_no, + dfield_get_data(new_val), + dfield_get_len(new_val)); + } +} + +/************************************************************************* +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. */ + +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + /* out: new pointer to mlog */ + dict_index_t* index, /* in: clustered index */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr,/* in: roll ptr of the undo log record */ + byte* log_ptr,/* pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr __attribute__((unused))) /* in: mtr */ +{ + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr); + + log_ptr += mach_write_compressed(log_ptr, + dict_index_get_sys_col_pos(index, DATA_TRX_ID)); + + trx_write_roll_ptr(log_ptr, roll_ptr); + log_ptr += DATA_ROLL_PTR_LEN; + + log_ptr += mach_dulint_write_compressed(log_ptr, trx->id); + + return(log_ptr); +} + +/************************************************************************* +Parses the log data of system field values. */ + +byte* +row_upd_parse_sys_vals( +/*===================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint* pos, /* out: TRX_ID position in record */ + dulint* trx_id, /* out: trx id */ + dulint* roll_ptr)/* out: roll ptr */ +{ + ptr = mach_parse_compressed(ptr, end_ptr, pos); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + DATA_ROLL_PTR_LEN) { + + return(NULL); + } + + *roll_ptr = trx_read_roll_ptr(ptr); + ptr += DATA_ROLL_PTR_LEN; + + ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id); + + return(ptr); +} + +/*************************************************************** +Writes to the redo log the new values of the fields occurring in the index. */ + +void +row_upd_index_write_log( +/*====================*/ + upd_t* update, /* in: update vector */ + byte* log_ptr,/* in: pointer to mlog buffer: must contain at least + MLOG_BUF_MARGIN bytes of free space; the buffer is + closed within this function */ + mtr_t* mtr) /* in: mtr into whose log to write */ +{ + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf_end; + ulint i; + + n_fields = upd_get_n_fields(update); + + buf_end = log_ptr + MLOG_BUF_MARGIN; + + mach_write_to_1(log_ptr, update->info_bits); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, n_fields); + + for (i = 0; i < n_fields; i++) { + + ut_ad(MLOG_BUF_MARGIN > 30); + + if (log_ptr + 30 > buf_end) { + mlog_close(mtr, log_ptr); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + + len = new_val->len; + + log_ptr += mach_write_compressed(log_ptr, upd_field->field_no); + log_ptr += mach_write_compressed(log_ptr, len); + + if (len != UNIV_SQL_NULL) { + if (log_ptr + len < buf_end) { + ut_memcpy(log_ptr, new_val->data, len); + + log_ptr += len; + } else { + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, new_val->data, len); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + } + } + + mlog_close(mtr, log_ptr); +} + +/************************************************************************* +Parses the log data written by row_upd_index_write_log. */ + +byte* +row_upd_index_parse( +/*================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + mem_heap_t* heap, /* in: memory heap where update vector is + built */ + upd_t** update_out)/* out: update vector */ +{ + upd_t* update; + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf; + ulint info_bits; + ulint i; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_bits = mach_read_from_1(ptr); + ptr++; + ptr = mach_parse_compressed(ptr, end_ptr, &n_fields); + + if (ptr == NULL) { + + return(NULL); + } + + update = upd_create(n_fields, heap); + update->info_bits = info_bits; + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + ptr = mach_parse_compressed(ptr, end_ptr, + &(upd_field->field_no)); + if (ptr == NULL) { + + return(NULL); + } + + ptr = mach_parse_compressed(ptr, end_ptr, &len); + + if (ptr == NULL) { + + return(NULL); + } + + new_val->len = len; + + if (len != UNIV_SQL_NULL) { + + if (end_ptr < ptr + len) { + + return(NULL); + } else { + buf = mem_heap_alloc(heap, len); + ut_memcpy(buf, ptr, len); + + ptr += len; + + new_val->data = buf; + } + } + } + + *update_out = update; + + return(ptr); +} + +/******************************************************************* +Returns TRUE if ext_vec contains i. */ +static +ibool +upd_ext_vec_contains( +/*=================*/ + /* out: TRUE if i is in ext_vec */ + ulint* ext_vec, /* in: array of indexes or NULL */ + ulint n_ext_vec, /* in: number of numbers in ext_vec */ + ulint i) /* in: a number */ +{ + ulint j; + + if (ext_vec == NULL) { + + return(FALSE); + } + + for (j = 0; j < n_ext_vec; j++) { + if (ext_vec[j] == i) { + + return(TRUE); + } + } + + return(FALSE); +} + +/******************************************************************* +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! */ + +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + /* out, own: update vector of differing + fields */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: entry to insert */ + rec_t* rec, /* in: secondary index record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + dfield_t* dfield; + byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint i; + ulint offsets_[REC_OFFS_SMALL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + /* This function is used only for a secondary index */ + ut_a(0 == (index->type & DICT_CLUSTERED)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE that it may be that len != dfield_get_len(dfield) if we + are updating in a character set and collation where strings of + different length can be equal in an alphabetical comparison, + and also in the case where we have a column prefix index + and the last characters in the index field are spaces; the + latter case probably caused the assertion failures reported at + row0upd.c line 713 in versions 4.0.14 - 4.0.16. */ + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (!dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + upd_field->extern_storage = FALSE; + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + +/******************************************************************* +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! */ + +upd_t* +row_upd_build_difference_binary( +/*============================*/ + /* out, own: update vector of differing + fields, excluding roll ptr and trx id */ + dict_index_t* index, /* in: clustered index */ + dtuple_t* entry, /* in: entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ + rec_t* rec, /* in: clustered index record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + dfield_t* dfield; + byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint roll_ptr_pos; + ulint trx_id_pos; + ibool extern_bit; + ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + /* This function is used only for a clustered index */ + ut_a(index->type & DICT_CLUSTERED); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); + trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (i == trx_id_pos || i == roll_ptr_pos) { + + goto skip_compare; + } + + extern_bit = rec_offs_nth_extern(offsets, i); + + if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i) + || !dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) { + upd_field->extern_storage = TRUE; + } else { + upd_field->extern_storage = FALSE; + } + + n_diff++; + } +skip_compare: + ; + } + + update->n_fields = n_diff; + + return(update); +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ + +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /* in/out: index entry where replaced */ + dict_index_t* index, /* in: index; NOTE that this may also be a + non-clustered index */ + upd_t* update, /* in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + mem_heap_t* heap) /* in: memory heap to which we allocate and + copy the new values, set this as NULL if you + do not want allocation */ +{ + dict_field_t* field; + upd_field_t* upd_field; + dfield_t* dfield; + dfield_t* new_val; + ulint j; + ulint i; + dtype_t* cur_type; + + ut_ad(index); + + dtuple_set_info_bits(entry, update->info_bits); + + for (j = 0; j < dict_index_get_n_fields(index); j++) { + + field = dict_index_get_nth_field(index, j); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (upd_field->field_no == j) { + + dfield = dtuple_get_nth_field(entry, j); + + new_val = &(upd_field->new_val); + + dfield_set_data(dfield, new_val->data, + new_val->len); + if (heap && new_val->len != UNIV_SQL_NULL) { + dfield->data = mem_heap_alloc(heap, + new_val->len); + ut_memcpy(dfield->data, new_val->data, + new_val->len); + } + + if (field->prefix_len > 0 + && new_val->len != UNIV_SQL_NULL) { + + cur_type = dict_col_get_type( + dict_field_get_col(field)); + + dfield->len = + dtype_get_at_most_n_mbchars( + cur_type, + field->prefix_len, + new_val->len, + new_val->data); + } + } + } + } +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ + +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /* in/out: index entry where replaced */ + dict_index_t* index, /* in: index; NOTE that this may also be a + non-clustered index */ + upd_t* update, /* in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /* in: memory heap to which we allocate and + copy the new values, set this as NULL if you + do not want allocation */ +{ + dict_field_t* field; + upd_field_t* upd_field; + dfield_t* dfield; + dfield_t* new_val; + ulint j; + ulint i; + dtype_t* cur_type; + + ut_ad(index); + + dtuple_set_info_bits(entry, update->info_bits); + + for (j = 0; j < dict_index_get_n_fields(index); j++) { + + field = dict_index_get_nth_field(index, j); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (upd_field->field_no == field->col->clust_pos) { + + dfield = dtuple_get_nth_field(entry, j); + + new_val = &(upd_field->new_val); + + dfield_set_data(dfield, new_val->data, + new_val->len); + if (heap && new_val->len != UNIV_SQL_NULL) { + dfield->data = mem_heap_alloc(heap, + new_val->len); + ut_memcpy(dfield->data, new_val->data, + new_val->len); + } + + if (field->prefix_len > 0 + && new_val->len != UNIV_SQL_NULL) { + + cur_type = dict_col_get_type( + dict_field_get_col(field)); + + dfield->len = + dtype_get_at_most_n_mbchars( + cur_type, + field->prefix_len, + new_val->len, + new_val->data); + } + } + } + } +} + +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! */ + +ibool +row_upd_changes_ord_field_binary( +/*=============================*/ + /* out: TRUE if update vector changes + an ordering field in the index record; + NOTE: the fields are compared as binary + strings */ + dtuple_t* row, /* in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + dict_index_t* index, /* in: index of the record */ + upd_t* update) /* in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +{ + upd_field_t* upd_field; + dict_field_t* ind_field; + dict_col_t* col; + ulint n_unique; + ulint n_upd_fields; + ulint col_pos; + ulint col_no; + ulint i, j; + + ut_ad(update && index); + + n_unique = dict_index_get_n_unique(index); + n_upd_fields = upd_get_n_fields(update); + + for (i = 0; i < n_unique; i++) { + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col); + col_no = dict_col_get_no(col); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field = upd_get_nth_field(update, j); + + /* Note that if the index field is a column prefix + then it may be that row does not contain an externally + stored part of the column value, and we cannot compare + the datas */ + + if (col_pos == upd_field->field_no + && (row == NULL + || ind_field->prefix_len > 0 + || !dfield_datas_are_binary_equal( + dtuple_get_nth_field(row, col_no), + &(upd_field->new_val)))) { + return(TRUE); + } + } + } + + return(FALSE); +} + +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +NOTE: we compare the fields as binary strings! */ + +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + /* out: TRUE if update vector may change + an ordering field in an index record */ + dict_table_t* table, /* in: table */ + upd_t* update) /* in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_index_t* index; + ulint i; + + index = dict_table_get_first_index(table); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (dict_field_get_col(dict_index_get_nth_field(index, + upd_field->field_no)) + ->ord_part) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*************************************************************** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + /* out: TRUE if changes */ + dtuple_t* entry, /* in: index entry */ + dict_index_t* index, /* in: index of entry */ + upd_t* update, /* in: update vector for the row */ + ulint n) /* in: how many first fields to check */ +{ + upd_field_t* upd_field; + dict_field_t* ind_field; + dict_col_t* col; + ulint n_upd_fields; + ulint col_pos; + ulint i, j; + + ut_a(update && index); + ut_a(n <= dict_index_get_n_fields(index)); + + n_upd_fields = upd_get_n_fields(update); + + for (i = 0; i < n; i++) { + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col); + + ut_a(ind_field->prefix_len == 0); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && !dfield_datas_are_binary_equal( + dtuple_get_nth_field(entry, i), + &(upd_field->new_val))) { + return(TRUE); + } + } + } + + return(FALSE); +} + +/************************************************************************* +Copies the column values from a record. */ +UNIV_INLINE +void +row_upd_copy_columns( +/*=================*/ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + sym_node_t* column) /* in: first column in a column list, or + NULL */ +{ + byte* data; + ulint len; + + while (column) { + data = rec_get_nth_field(rec, offsets, + column->field_nos[SYM_CLUST_FIELD_NO], + &len); + eval_node_copy_and_alloc_val(column, data, len); + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Calculates the new values for fields to update. Note that row_upd_copy_columns +must have been called first. */ +UNIV_INLINE +void +row_upd_eval_new_vals( +/*==================*/ + upd_t* update) /* in: update vector */ +{ + que_node_t* exp; + upd_field_t* upd_field; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + exp = upd_field->exp; + + eval_exp(exp); + + dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp)); + } +} + +/*************************************************************** +Stores to the heap the row on which the node->pcur is positioned. */ +static +void +row_upd_store_row( +/*==============*/ + upd_node_t* node) /* in: row update node */ +{ + dict_index_t* clust_index; + upd_t* update; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); + + if (node->row != NULL) { + mem_heap_empty(node->heap); + node->row = NULL; + } + + clust_index = dict_table_get_first_index(node->table); + + rec = btr_pcur_get_rec(node->pcur); + + offsets = rec_get_offsets(rec, clust_index, offsets_, + ULINT_UNDEFINED, &heap); + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + node->heap); + node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint) + * rec_offs_n_fields(offsets)); + if (node->is_delete) { + update = NULL; + } else { + update = node->update; + } + + node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec, + offsets, update); + if (heap) { + mem_heap_free(heap); + } +} + +/*************************************************************** +Updates a secondary index entry of a row. */ +static +ulint +row_upd_sec_index_entry( +/*====================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ibool check_ref; + ibool found; + dict_index_t* index; + dtuple_t* entry; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + mem_heap_t* heap; + rec_t* rec; + ulint err = DB_SUCCESS; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + + index = node->index; + + check_ref = row_upd_index_is_referenced(index, trx); + + heap = mem_heap_create(1024); + + /* Build old index entry */ + entry = row_build_index_entry(node->row, index, heap); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, + &mtr); + btr_cur = btr_pcur_get_btr_cur(&pcur); + + rec = btr_cur_get_rec(btr_cur); + + if (!found) { + fputs("InnoDB: error in sec index entry update in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + + trx_print(stderr, trx); + + fputs("\n" +"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); + } else { + /* Delete mark the old index record; it can already be + delete marked if we return after a lock wait in + row_ins_index_entry below */ + + if (!rec_get_deleted_flag(rec, index->table->comp)) { + err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, + thr, &mtr); + if (err == DB_SUCCESS && check_ref) { + + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, + &pcur, index->table, + index, thr, &mtr); + if (err != DB_SUCCESS) { + + goto close_cur; + } + } + + } + } +close_cur: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->is_delete || err != DB_SUCCESS) { + + mem_heap_free(heap); + + return(err); + } + + /* Build a new index entry */ + row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); + + /* Insert new index entry */ + err = row_ins_index_entry(index, entry, NULL, 0, thr); + + mem_heap_free(heap); + + return(err); +} + +/*************************************************************** +Updates secondary index record if it is changed in the row update. This +should be quite rare in database applications. */ +UNIV_INLINE +ulint +row_upd_sec_step( +/*=============*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC) + || (node->state == UPD_NODE_UPDATE_SOME_SEC)); + ut_ad(!(node->index->type & DICT_CLUSTERED)); + + if (node->state == UPD_NODE_UPDATE_ALL_SEC + || row_upd_changes_ord_field_binary(node->row, node->index, + node->update)) { + err = row_upd_sec_index_entry(node, thr); + + return(err); + } + + return(DB_SUCCESS); +} + +/*************************************************************** +Marks the clustered index record deleted and inserts the updated version +of the record to the index. This function should be used when the ordering +fields of the clustered index record change. This should be quite rare in +database applications. */ +static +ulint +row_upd_clust_rec_by_insert( +/*========================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index of the record */ + que_thr_t* thr, /* in: query thread */ + ibool check_ref,/* in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + mem_heap_t* heap = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + dtuple_t* entry; + ulint err; + + ut_ad(node); + ut_ad(index->type & DICT_CLUSTERED); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + if (node->state != UPD_NODE_INSERT_CLUSTERED) { + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + return(err); + } + + /* Mark as not-owned the externally stored fields which the new + row inherits from the delete marked record: purge should not + free those externally stored fields even if the delete marked + record is removed from the index tree, or updated. */ + + btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur), + rec_get_offsets(btr_cur_get_rec(btr_cur), + dict_table_get_first_index(table), offsets_, + ULINT_UNDEFINED, &heap), node->update, mtr); + if (check_ref) { + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints(node, + pcur, table, + index, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + if (heap) { + mem_heap_free(heap); + } + return(err); + } + } + + } + + mtr_commit(mtr); + + if (!heap) { + heap = mem_heap_create(500); + } + node->state = UPD_NODE_INSERT_CLUSTERED; + + entry = row_build_index_entry(node->row, index, heap); + + row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); + + row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); + + /* If we return from a lock wait, for example, we may have + extern fields marked as not-owned in entry (marked in the + if-branch above). We must unmark them. */ + + btr_cur_unmark_dtuple_extern_fields(entry, node->ext_vec, + node->n_ext_vec); + /* We must mark non-updated extern fields in entry as inherited, + so that a possible rollback will not free them */ + + btr_cur_mark_dtuple_inherited_extern(entry, node->ext_vec, + node->n_ext_vec, + node->update); + + err = row_ins_index_entry(index, entry, node->ext_vec, + node->n_ext_vec, thr); + mem_heap_free(heap); + + return(err); +} + +/*************************************************************** +Updates a clustered index record of a row when the ordering fields do +not change. */ +static +ulint +row_upd_clust_rec( +/*==============*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + big_rec_t* big_rec = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(index->type & DICT_CLUSTERED); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } + + mtr_commit(mtr); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr_start(mtr); + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); + + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, + &big_rec, node->update, + node->cmpl_info, thr, mtr); + mtr_commit(mtr); + + if (err == DB_SUCCESS && big_rec) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + mtr_start(mtr); + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + rec = btr_cur_get_rec(btr_cur); + err = btr_store_big_rec_extern_fields(index, rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + big_rec, mtr); + if (heap) { + mem_heap_free(heap); + } + mtr_commit(mtr); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + + return(err); +} + +/*************************************************************** +Delete marks a clustered index record. */ +static +ulint +row_upd_del_mark_clust_rec( +/*=======================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index */ + que_thr_t* thr, /* in: query thread */ + ibool check_ref,/* in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(node->is_delete); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Store row because we have to build also the secondary index + entries */ + + row_upd_store_row(node); + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err == DB_SUCCESS && check_ref) { + /* NOTE that the following call loses the position of pcur ! */ + + err = row_upd_check_references_constraints(node, + pcur, index->table, + index, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + + return(err); + } + } + + mtr_commit(mtr); + + return(err); +} + +/*************************************************************** +Updates the clustered index record. */ +static +ulint +row_upd_clust_step( +/*===============*/ + /* out: DB_SUCCESS if operation successfully + completed, DB_LOCK_WAIT in case of a lock wait, + else error code */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + ibool success; + ibool check_ref; + ulint err; + mtr_t* mtr; + mtr_t mtr_buf; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + index = dict_table_get_first_index(node->table); + + check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr)); + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + mtr = &mtr_buf; + + mtr_start(mtr); + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. */ + + ut_a(pcur->rel_pos == BTR_PCUR_ON); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + + if (!success) { + err = DB_RECORD_NOT_FOUND; + + mtr_commit(mtr); + + return(err); + } + + /* If this is a row in SYS_INDEXES table of the data dictionary, + then we have to free the file segments of the index tree associated + with the index */ + + if (node->is_delete + && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + + dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, + mtr); + if (!success) { + err = DB_ERROR; + + mtr_commit(mtr); + + return(err); + } + } + + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (!node->has_clust_rec_x_lock) { + err = lock_clust_rec_modify_check_and_lock(0, + rec, index, offsets, thr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + goto exit_func; + } + } + + /* NOTE: the following function calls will also commit mtr */ + + if (node->is_delete) { + err = row_upd_del_mark_clust_rec(node, index, thr, check_ref, + mtr); + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; + node->index = dict_table_get_next_index(index); + } + exit_func: + if (heap) { + mem_heap_free(heap); + } + return(err); + } + + /* If the update is made for MySQL, we already have the update vector + ready, else we have to do some evaluation: */ + + if (!node->in_mysql_interface) { + /* Copy the necessary columns from clust_rec and calculate the + new values to set */ + row_upd_copy_columns(rec, offsets, + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + } + + if (heap) { + mem_heap_free(heap); + } + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + + err = row_upd_clust_rec(node, index, thr, mtr); + return(err); + } + + row_upd_store_row(node); + + if (row_upd_changes_ord_field_binary(node->row, index, node->update)) { + + /* Update causes an ordering field (ordering fields within + the B-tree) of the clustered index record to change: perform + the update by delete marking and inserting. + + TODO! What to do to the 'Halloween problem', where an update + moves the record forward in index so that it is again + updated when the cursor arrives there? Solution: the + read operation must check the undo record undo number when + choosing records to update. MySQL solves now the problem + externally! */ + + err = row_upd_clust_rec_by_insert(node, index, thr, check_ref, + mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + node->state = UPD_NODE_UPDATE_ALL_SEC; + } else { + err = row_upd_clust_rec(node, index, thr, mtr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->state = UPD_NODE_UPDATE_SOME_SEC; + } + + node->index = dict_table_get_next_index(index); + + return(err); +} + +/*************************************************************** +Updates the affected index records of a row. When the control is transferred +to this node, we assume that we have a persistent cursor which was on a +record, and the position of the cursor is stored in the cursor. */ +static +ulint +row_upd( +/*====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err = DB_SUCCESS; + + ut_ad(node && thr); + + if (node->in_mysql_interface) { + + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (node->is_delete || + row_upd_changes_some_index_ord_field_binary( + node->table, node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + if (node->state == UPD_NODE_UPDATE_CLUSTERED + || node->state == UPD_NODE_INSERT_CLUSTERED) { + + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + } + + if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + + goto function_exit; + } + + while (node->index != NULL) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->index = dict_table_get_next_index(node->index); + } + +function_exit: + if (err == DB_SUCCESS) { + /* Do some cleanup */ + + if (node->row != NULL) { + node->row = NULL; + node->n_ext_vec = 0; + mem_heap_empty(node->heap); + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + } + + return(err); +} + +/*************************************************************** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_upd_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + ulint err = DB_SUCCESS; + trx_t* trx; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started(trx); + + node = thr->run_node; + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + return(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + return(thr); +} + +/************************************************************************* +Performs an in-place update for the current clustered index record in +select. */ + +void +row_upd_in_place_in_select( +/*=======================*/ + sel_node_t* sel_node, /* in: select node */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + upd_node_t* node; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(sel_node->select_will_do_update); + ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF); + ut_ad(sel_node->asc); + + node = que_node_get_parent(sel_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Copy the necessary columns from clust_rec and calculate the new + values to set */ + + row_upd_copy_columns(btr_pcur_get_rec(pcur), rec_get_offsets( + btr_pcur_get_rec(pcur), btr_cur->index, offsets_, + ULINT_UNDEFINED, &heap), + UT_LIST_GET_FIRST(node->columns)); + if (heap) { + mem_heap_free(heap); + } + row_upd_eval_new_vals(node->update); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + btr_cur->index->table->comp)); + + ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE); + ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); + ut_ad(node->select_will_do_update); + + err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur, + node->update, node->cmpl_info, + thr, mtr); + ut_ad(err == DB_SUCCESS); +} diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c new file mode 100644 index 00000000000..36f6c27636d --- /dev/null +++ b/storage/innobase/row/row0vers.c @@ -0,0 +1,492 @@ +/****************************************************** +Row versions + +(c) 1997 Innobase Oy + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" + +#ifdef UNIV_NONINL +#include "row0vers.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "lock0lock.h" + +/********************************************************************* +Finds out if an active transaction has inserted or modified a secondary +index record. NOTE: the kernel mutex is temporarily released in this +function! */ + +trx_t* +row_vers_impl_x_locked_off_kernel( +/*==============================*/ + /* out: NULL if committed, else the active + transaction; NOTE that the kernel mutex is + temporarily released! */ + rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + ulint* clust_offsets; + rec_t* version; + rec_t* prev_version; + dulint trx_id; + dulint prev_trx_id; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + dtuple_t* entry = NULL; /* assignment to eliminate compiler + warning */ + trx_t* trx; + ibool vers_del; + ibool rec_del; + ulint err; + mtr_t mtr; + ibool comp; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Search for the clustered index record: this is a time-consuming + operation: therefore we release the kernel mutex; also, the release + is required by the latching order convention. The latch on the + clustered index locks the top of the stack of versions. We also + reserve purge_latch to lock the bottom of the version stack. */ + + clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, + &clust_index, &mtr); + if (!clust_rec) { + /* In a rare case it is possible that no clust rec is found + for a secondary index record: if in row0umod.c + row_undo_mod_remove_clust_low() we have already removed the + clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case there cannot be + any implicit lock on the secondary index record, because + an active transaction which has modified the secondary index + record has also modified the clustered index record. And in + a rollback we always undo the modifications to secondary index + records before the clustered index record. */ + + mutex_enter(&kernel_mutex); + mtr_commit(&mtr); + + return(NULL); + } + + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); + + mtr_s_lock(&(purge_sys->latch), &mtr); + + mutex_enter(&kernel_mutex); + + trx = NULL; + if (!trx_is_active(trx_id)) { + /* The transaction that modified or inserted clust_rec is no + longer active: no implicit lock on rec */ + goto exit_func; + } + + if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, + clust_offsets, TRUE)) { + /* Corruption noticed: try to avoid a crash by returning */ + goto exit_func; + } + + comp = index->table->comp; + ut_ad(index->table == clust_index->table); + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + ut_ad(comp == page_is_comp(buf_frame_align(clust_rec))); + + /* We look up if some earlier version, which was modified by the trx_id + transaction, of the clustered index record would require rec to be in + a different state (delete marked or unmarked, or have different field + values, or not existing). If there is such a version, then rec was + modified by the trx_id transaction, and it has an implicit x-lock on + rec. Note that if clust_rec itself would require rec to be in a + different state, then the trx_id transaction has not yet had time to + modify rec, and does not necessarily have an implicit x-lock on rec. */ + + rec_del = rec_get_deleted_flag(rec, comp); + trx = NULL; + + version = clust_rec; + + for (;;) { + mutex_exit(&kernel_mutex); + + /* While we retrieve an earlier version of clust_rec, we + release the kernel mutex, because it may take time to access + the disk. After the release, we have to check if the trx_id + transaction is still active. We keep the semaphore in mtr on + the clust_rec page, so that no other transaction can update + it and get an implicit x-lock on rec. */ + + heap2 = heap; + heap = mem_heap_create(1024); + err = trx_undo_prev_version_build(clust_rec, &mtr, version, + clust_index, clust_offsets, heap, + &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (prev_version) { + clust_offsets = rec_get_offsets(prev_version, + clust_index, NULL, + ULINT_UNDEFINED, &heap); + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, heap); + entry = row_build_index_entry(row, index, heap); + } + + mutex_enter(&kernel_mutex); + + if (!trx_is_active(trx_id)) { + /* Transaction no longer active: no implicit x-lock */ + + break; + } + + /* If the transaction is still active, the previous version + of clust_rec must be accessible if not a fresh insert; we + may assert the following: */ + + ut_ad(err == DB_SUCCESS); + + if (prev_version == NULL) { + /* It was a freshly inserted version: there is an + implicit x-lock on rec */ + + trx = trx_get_on_id(trx_id); + + break; + } + + /* If we get here, we know that the trx_id transaction is + still active and it has modified prev_version. Let us check + if prev_version would require rec to be in a different + state. */ + + vers_del = rec_get_deleted_flag(prev_version, comp); + + /* We check if entry and rec are identified in the alphabetical + ordering */ + if (0 == cmp_dtuple_rec(entry, rec, offsets)) { + /* The delete marks of rec and prev_version should be + equal for rec to be in the state required by + prev_version */ + + if (rec_del != vers_del) { + trx = trx_get_on_id(trx_id); + + break; + } + + /* It is possible that the row was updated so that the + secondary index record remained the same in + alphabetical ordering, but the field values changed + still. For example, 'abc' -> 'ABC'. Check also that. */ + + dtuple_set_types_binary(entry, + dtuple_get_n_fields(entry)); + if (0 != cmp_dtuple_rec(entry, rec, offsets)) { + + trx = trx_get_on_id(trx_id); + + break; + } + } else if (!rec_del) { + /* The delete mark should be set in rec for it to be + in the state required by prev_version */ + + trx = trx_get_on_id(trx_id); + + break; + } + + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); + + if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) { + /* The versions modified by the trx_id transaction end + to prev_version: no implicit x-lock */ + + break; + } + + version = prev_version; + }/* for (;;) */ + +exit_func: + mtr_commit(&mtr); + mem_heap_free(heap); + + return(trx); +} + +/********************************************************************* +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. */ + +ibool +row_vers_must_preserve_del_marked( +/*==============================*/ + /* out: TRUE if earlier version should be preserved */ + dulint trx_id, /* in: transaction id in the version */ + mtr_t* mtr) /* in: mtr holding the latch on the clustered index + record; it will also hold the latch on purge_view */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + mtr_s_lock(&(purge_sys->latch), mtr); + + if (trx_purge_update_undo_must_exist(trx_id)) { + + /* A purge operation is not yet allowed to remove this + delete marked record */ + + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************* +Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry and ientry are identified in +the alphabetical ordering; exactly in this case we return TRUE. */ + +ibool +row_vers_old_has_index_entry( +/*=========================*/ + /* out: TRUE if earlier version should have */ + ibool also_curr,/* in: TRUE if also rec is included in the + versions to search; otherwise only versions + prior to it are searched */ + rec_t* rec, /* in: record in the clustered index; the + caller must have a latch on the page */ + mtr_t* mtr, /* in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /* in: the secondary index */ + dtuple_t* ientry) /* in: the secondary index entry */ +{ + rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + ulint* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + dtuple_t* entry; + ulint err; + ibool comp; + + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + mtr_s_lock(&(purge_sys->latch), mtr); + + clust_index = dict_table_get_first_index(index->table); + + comp = index->table->comp; + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, heap); + entry = row_build_index_entry(row, index, heap); + + /* NOTE that we cannot do the comparison as binary + fields because the row is maybe being modified so that + the clustered index record has already been updated + to a different binary value in a char field, but the + collation identifies the old and new value anyway! */ + + if (dtuple_datas_are_ordering_equal(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = rec; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + err = trx_undo_prev_version_build(rec, mtr, version, + clust_index, clust_offsets, heap, + &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (err != DB_SUCCESS || !prev_version) { + /* Versions end here */ + + mem_heap_free(heap); + + return(FALSE); + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + if (!rec_get_deleted_flag(prev_version, comp)) { + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, heap); + entry = row_build_index_entry(row, index, heap); + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! */ + + if (dtuple_datas_are_ordering_equal(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = prev_version; + } +} + +/********************************************************************* +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version. */ + +ulint +row_vers_build_for_consistent_read( +/*===============================*/ + /* out: DB_SUCCESS or DB_MISSING_HISTORY */ + rec_t* rec, /* in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /* in: mtr holding the latch on rec */ + dict_index_t* index, /* in: the clustered index */ + ulint** offsets,/* in/out: offsets returned by + rec_get_offsets(rec, index) */ + read_view_t* view, /* in: the consistent read view */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/* in: memory heap from which the memory for + old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers)/* out, own: old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ +{ + rec_t* version; + rec_t* prev_version; + dulint prev_trx_id; + mem_heap_t* heap = NULL; + byte* buf; + ulint err; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(rec_offs_validate(rec, index, *offsets)); + + ut_ad(!read_view_sees_trx_id(view, + row_get_rec_trx_id(rec, index, *offsets))); + + rw_lock_s_lock(&(purge_sys->latch)); + version = rec; + + for (;;) { + mem_heap_t* heap2 = heap; + heap = mem_heap_create(1024); + + err = trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, &prev_version); + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (err != DB_SUCCESS) { + break; + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + err = DB_SUCCESS; + + break; + } + + *offsets = rec_get_offsets(prev_version, index, *offsets, + ULINT_UNDEFINED, offset_heap); + prev_trx_id = row_get_rec_trx_id(prev_version, index, + *offsets); + + if (read_view_sees_trx_id(view, prev_trx_id)) { + + /* The view already sees this version: we can copy + it to in_heap and return */ + + buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); + *old_vers = rec_copy(buf, prev_version, *offsets); + rec_offs_make_valid(*old_vers, index, *offsets); + err = DB_SUCCESS; + + break; + } + + version = prev_version; + }/* for (;;) */ + + mem_heap_free(heap); + rw_lock_s_unlock(&(purge_sys->latch)); + + return(err); +} |