author | Guilhem Bichot <guilhem@mysql.com> | 2009-08-07 12:16:00 +0200
committer | Guilhem Bichot <guilhem@mysql.com> | 2009-08-07 12:16:00 +0200
commit | 7f2dd1570420a8fe096324db972ae3c775113a33 (patch)
tree | 65b42f5cb11f29ea5b4414ff075ccafd48569ad6 /storage/innobase/row
parent | 10080d547e012aac4feb566fd7b51bbb485a717b (diff)
download | mariadb-git-7f2dd1570420a8fe096324db972ae3c775113a33.tar.gz
Renamed storage/innodb_plugin to storage/innobase, so that 1) it's the same
layout as we always had in trees containing only the built-in InnoDB, and
2) win\configure.js WITH_INNOBASE_STORAGE_ENGINE still works.
storage/innobase/CMakeLists.txt:
fix to new directory name (and like 5.1)
storage/innobase/Makefile.am:
fix to new directory name (and like 5.1)
storage/innobase/handler/ha_innodb.cc:
fix to new directory name (and like 5.1)
storage/innobase/plug.in:
fix to new directory name (and like 5.1)
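(For reference, the win\configure.js script mentioned in point 2 is the
5.1-era Windows build-configuration step; it is normally invoked roughly as
follows, with the option name taken from the commit message and anything
else illustrative:

    cscript win\configure.js WITH_INNOBASE_STORAGE_ENGINE

after which the tree is built with the usual CMake-based Windows build.)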
Diffstat (limited to 'storage/innobase/row')
-rw-r--r-- | storage/innobase/row/row0ext.c | 115
-rw-r--r-- | storage/innobase/row/row0ins.c | 2508
-rw-r--r-- | storage/innobase/row/row0merge.c | 2364
-rw-r--r-- | storage/innobase/row/row0mysql.c | 4241
-rw-r--r-- | storage/innobase/row/row0purge.c | 689
-rw-r--r-- | storage/innobase/row/row0row.c | 1168
-rw-r--r-- | storage/innobase/row/row0sel.c | 4736
-rw-r--r-- | storage/innobase/row/row0uins.c | 350
-rw-r--r-- | storage/innobase/row/row0umod.c | 815
-rw-r--r-- | storage/innobase/row/row0undo.c | 377
-rw-r--r-- | storage/innobase/row/row0upd.c | 2177
-rw-r--r-- | storage/innobase/row/row0vers.c | 741
12 files changed, 20281 insertions, 0 deletions
diff --git a/storage/innobase/row/row0ext.c b/storage/innobase/row/row0ext.c
new file mode 100644
index 00000000000..7320f5b1dca
--- /dev/null
+++ b/storage/innobase/row/row0ext.c
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.c
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+
+#ifdef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#include "btr0cur.h"
+
+/********************************************************************//**
+Fills the column prefix cache of an externally stored column. */
+static
+void
+row_ext_cache_fill(
+/*===============*/
+	row_ext_t*	ext,	/*!< in/out: column prefix cache */
+	ulint		i,	/*!< in: index of ext->ext[] */
+	ulint		zip_size,/*!< compressed page size in bytes, or 0 */
+	const dfield_t*	dfield)	/*!< in: data field */
+{
+	const byte*	field	= dfield_get_data(dfield);
+	ulint		f_len	= dfield_get_len(dfield);
+	byte*		buf	= ext->buf + i * REC_MAX_INDEX_COL_LEN;
+
+	ut_ad(i < ext->n_ext);
+	ut_ad(dfield_is_ext(dfield));
+	ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+				  field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+				  BTR_EXTERN_FIELD_REF_SIZE))) {
+		/* The BLOB pointer is not set: we cannot fetch it */
+		ext->len[i] = 0;
+	} else {
+		/* Fetch at most REC_MAX_INDEX_COL_LEN of the column.
+		The column should be non-empty. However,
+		trx_rollback_or_clean_all_recovered() may try to
+		access a half-deleted BLOB if the server previously
+		crashed during the execution of
+		btr_free_externally_stored_field(). */
+		ext->len[i] = btr_copy_externally_stored_field_prefix(
+			buf, REC_MAX_INDEX_COL_LEN, zip_size, field, f_len);
+	}
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	const ulint*	ext,	/*!< in: col_no's of externally stored columns
+				in the InnoDB table object, as reported by
+				dict_col_get_no(); NOT relative to the records
+				in the clustered index */
+	const dtuple_t*	tuple,	/*!< in: data tuple containing the field
+				references of the externally stored
+				columns; must be indexed by col_no;
+				the clustered index record must be
+				covered by a lock or a page latch
+				to prevent deletion (rollback or purge). */
+	ulint		zip_size,/*!< compressed page size in bytes, or 0 */
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	ulint		i;
+	row_ext_t*	ret = mem_heap_alloc(heap, (sizeof *ret)
+					     + (n_ext - 1) * sizeof ret->len);
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+	ret->n_ext = n_ext;
+	ret->ext = ext;
+	ret->buf = mem_heap_alloc(heap, n_ext * REC_MAX_INDEX_COL_LEN);
+#ifdef UNIV_DEBUG
+	memset(ret->buf, 0xaa, n_ext * REC_MAX_INDEX_COL_LEN);
+	UNIV_MEM_ALLOC(ret->buf, n_ext * REC_MAX_INDEX_COL_LEN);
+#endif
+
+	/* Fetch the BLOB prefixes */
+	for (i = 0; i < n_ext; i++) {
+		const dfield_t*	dfield;
+
+		dfield = dtuple_get_nth_field(tuple, ext[i]);
+		row_ext_cache_fill(ret, i, zip_size, dfield);
+	}
+
+	return(ret);
+}
diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c
new file mode 100644
index 00000000000..930c9ec1fc7
--- /dev/null
+++ b/storage/innobase/row/row0ins.c
@@ -0,0 +1,2508 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.c
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+#include "buf0lru.h"
+
+#define	ROW_INS_PREV	1
+#define	ROW_INS_NEXT	2
+
+
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+	ulint		ins_type,	/*!< in: INS_VALUES, ... */
+	dict_table_t*	table,	/*!< in: table where to insert */
+	mem_heap_t*	heap)	/*!< in: mem heap where created */
+{
+	ins_node_t*	node;
+
+	node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+	node->common.type = QUE_NODE_INSERT;
+
+	node->ins_type = ins_type;
+
+	node->state = INS_NODE_SET_IX_LOCK;
+	node->table = table;
+	node->index = NULL;
+	node->entry = NULL;
+
+	node->select = NULL;
+
+	node->trx_id = ut_dulint_zero;
+
+	node->entry_sys_heap = mem_heap_create(128);
+
+	node->magic_n = INS_NODE_MAGIC_N;
+
+	return(node);
+}
+
+/***********************************************************//**
+Creates an entry template for each index of a table.
*/ +UNIV_INTERN +void +ins_node_create_entry_list( +/*=======================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + dict_index_t* index; + dtuple_t* entry; + + ut_ad(node->entry_sys_heap); + + UT_LIST_INIT(node->entry_list); + + index = dict_table_get_first_index(node->table); + + while (index != NULL) { + entry = row_build_index_entry(node->row, NULL, index, + node->entry_sys_heap); + UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry); + + index = dict_table_get_next_index(index); + } +} + +/*****************************************************************//** +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /*!< in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + mem_heap_t* heap; + const dict_col_t* col; + dfield_t* dfield; + byte* ptr; + + row = node->row; + table = node->table; + heap = node->entry_sys_heap; + + ut_ad(row && table && heap); + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* 1. Allocate buffer for row id */ + + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN); + + node->row_id_buf = ptr; + + /* 3. Allocate buffer for trx id */ + + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN); + + node->trx_id_buf = ptr; + + /* 4. Allocate buffer for roll ptr */ + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + + dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN); +} + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +UNIV_INTERN +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row) /*!< in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry = NULL; + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = ut_dulint_zero; +} + +/*******************************************************************//** +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is +kept in the index for consistent reads. 
+@return DB_SUCCESS or error code */ +static +ulint +row_ins_sec_index_entry_by_modify( +/*==============================*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + big_rec_t* dummy_big_rec; + mem_heap_t* heap; + upd_t* update; + rec_t* rec; + ulint err; + + rec = btr_cur_get_rec(cursor); + + ut_ad(!dict_index_is_clust(cursor->index)); + ut_ad(rec_get_deleted_flag(rec, + dict_table_is_comp(cursor->index->table))); + + /* We know that in the alphabetical ordering, entry and rec are + identified. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the + difference. */ + + heap = mem_heap_create(1024); + + update = row_upd_build_sec_rec_difference_binary( + cursor->index, entry, rec, thr_get_trx(thr), heap); + if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor, + update, 0, thr, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + + goto func_exit; + } + + err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor, + &heap, &dummy_big_rec, update, + 0, thr, mtr); + ut_ad(!dummy_big_rec); + } +func_exit: + mem_heap_free(heap); + + return(err); +} + +/*******************************************************************//** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. 
+@return DB_SUCCESS, DB_FAIL, or error code */ +static +ulint +row_ins_clust_index_entry_by_modify( +/*================================*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + big_rec_t** big_rec,/*!< out: possible big rec vector of fields + which have to be stored externally by the + caller */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + rec_t* rec; + upd_t* update; + ulint err; + + ut_ad(dict_index_is_clust(cursor->index)); + + *big_rec = NULL; + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec, + dict_table_is_comp(cursor->index->table))); + + if (!*heap) { + *heap = mem_heap_create(1024); + } + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ + + update = row_upd_build_difference_binary(cursor->index, entry, rec, + thr_get_trx(thr), *heap); + if (mode == BTR_MODIFY_LEAF) { + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(0, cursor, update, 0, thr, + mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + + } + err = btr_cur_pessimistic_update(0, cursor, + heap, big_rec, update, + 0, thr, mtr); + } + + return(err); +} + +/*********************************************************************//** +Returns TRUE if in a cascaded update/delete an ancestor node of node +updates (not DELETE, but UPDATE) table. +@return TRUE if an ancestor updates table */ +static +ibool +row_ins_cascade_ancestor_updates_table( +/*===================================*/ + que_node_t* node, /*!< in: node in a query graph */ + dict_table_t* table) /*!< in: table */ +{ + que_node_t* parent; + upd_node_t* upd_node; + + parent = que_node_get_parent(node); + + while (que_node_get_type(parent) == QUE_NODE_UPDATE) { + + upd_node = parent; + + if (upd_node->table == table && upd_node->is_delete == FALSE) { + + return(TRUE); + } + + parent = que_node_get_parent(parent); + + ut_a(parent); + } + + return(FALSE); +} + +/*********************************************************************//** +Returns the number of ancestor UPDATE or DELETE nodes of a +cascaded update/delete node. +@return number of ancestors */ +static +ulint +row_ins_cascade_n_ancestors( +/*========================*/ + que_node_t* node) /*!< in: node in a query graph */ +{ + que_node_t* parent; + ulint n_ancestors = 0; + + parent = que_node_get_parent(node); + + while (que_node_get_type(parent) == QUE_NODE_UPDATE) { + n_ancestors++; + + parent = que_node_get_parent(parent); + + ut_a(parent); + } + + return(n_ancestors); +} + +/******************************************************************//** +Calculates the update vector node->cascade->update for a child table in +a cascaded update. 
+@return number of fields in the calculated update vector; the value +can also be 0 if no foreign key fields changed; the returned value is +ULINT_UNDEFINED if the column type in the child table is too short to +fit the new value in the parent table: that means the update fails */ +static +ulint +row_ins_cascade_calc_update_vec( +/*============================*/ + upd_node_t* node, /*!< in: update node of the parent + table */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + mem_heap_t* heap) /*!< in: memory heap to use as + temporary storage */ +{ + upd_node_t* cascade = node->cascade_node; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index = foreign->foreign_index; + upd_t* update; + upd_field_t* ufield; + dict_table_t* parent_table; + dict_index_t* parent_index; + upd_t* parent_update; + upd_field_t* parent_ufield; + ulint n_fields_updated; + ulint parent_field_no; + ulint i; + ulint j; + + ut_a(node); + ut_a(foreign); + ut_a(cascade); + ut_a(table); + ut_a(index); + + /* Calculate the appropriate update vector which will set the fields + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. */ + + parent_table = node->table; + ut_a(parent_table == foreign->referenced_table); + parent_index = foreign->referenced_index; + parent_update = node->update; + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + n_fields_updated = 0; + + for (i = 0; i < foreign->n_fields; i++) { + + parent_field_no = dict_table_get_nth_col_pos( + parent_table, + dict_index_get_nth_col_no(parent_index, i)); + + for (j = 0; j < parent_update->n_fields; j++) { + parent_ufield = parent_update->fields + j; + + if (parent_ufield->field_no == parent_field_no) { + + ulint min_size; + const dict_col_t* col; + ulint ufield_len; + + col = dict_index_get_nth_col(index, i); + + /* A field in the parent index record is + updated. Let us make the update vector + field for the child table. */ + + ufield = update->fields + n_fields_updated; + + ufield->field_no + = dict_table_get_nth_col_pos( + table, dict_col_get_no(col)); + ufield->exp = NULL; + + ufield->new_val = parent_ufield->new_val; + ufield_len = dfield_get_len(&ufield->new_val); + + /* Clear the "external storage" flag */ + dfield_set_len(&ufield->new_val, ufield_len); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (dfield_is_null(&ufield->new_val) + && (col->prtype & DATA_NOT_NULL)) { + + return(ULINT_UNDEFINED); + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (!dfield_is_null(&ufield->new_val) + && dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + col->len, + ufield_len, + dfield_get_data(&ufield->new_val)) + < ufield_len) { + + return(ULINT_UNDEFINED); + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + min_size = dict_col_get_min_size(col); + + /* Because UNIV_SQL_NULL (the marker + of SQL NULL values) exceeds all possible + values of min_size, the test below will + not hold for SQL NULL columns. 
*/ + + if (min_size > ufield_len) { + + char* pad_start; + const char* pad_end; + char* padded_data + = mem_heap_alloc( + heap, min_size); + pad_start = padded_data + ufield_len; + pad_end = padded_data + min_size; + + memcpy(padded_data, + dfield_get_data(&ufield + ->new_val), + dfield_get_len(&ufield + ->new_val)); + + switch (UNIV_EXPECT(col->mbminlen,1)) { + default: + ut_error; + return(ULINT_UNDEFINED); + case 1: + if (UNIV_UNLIKELY + (dtype_get_charset_coll( + col->prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL)) { + /* Do not pad BINARY + columns. */ + return(ULINT_UNDEFINED); + } + + /* space=0x20 */ + memset(pad_start, 0x20, + pad_end - pad_start); + break; + case 2: + /* space=0x0020 */ + ut_a(!(ufield_len % 2)); + ut_a(!(min_size % 2)); + do { + *pad_start++ = 0x00; + *pad_start++ = 0x20; + } while (pad_start < pad_end); + break; + } + + dfield_set_data(&ufield->new_val, + padded_data, min_size); + } + + n_fields_updated++; + } + } + } + + update->n_fields = n_fields_updated; + + return(n_fields_updated); +} + +/*********************************************************************//** +Set detailed error message associated with foreign key errors for +the given transaction. */ +static +void +row_ins_set_detailed( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign) /*!< in: foreign key constraint */ +{ + mutex_enter(&srv_misc_tmpfile_mutex); + rewind(srv_misc_tmpfile); + + if (os_file_set_eof(srv_misc_tmpfile)) { + ut_print_name(srv_misc_tmpfile, trx, TRUE, + foreign->foreign_table_name); + dict_print_info_on_foreign_key_in_create_format( + srv_misc_tmpfile, trx, foreign, FALSE); + trx_set_detailed_error_from_file(trx, srv_misc_tmpfile); + } else { + trx_set_detailed_error(trx, "temp file operation failed"); + } + + mutex_exit(&srv_misc_tmpfile_mutex); +} + +/*********************************************************************//** +Reports a foreign key error associated with an update or a delete of a +parent table index entry. 
*/ +static +void +row_ins_foreign_report_err( +/*=======================*/ + const char* errstr, /*!< in: error string from the viewpoint + of the parent table */ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a matching index record in the + child table */ + const dtuple_t* entry) /*!< in: index entry in the parent + table */ +{ + FILE* ef = dict_foreign_err_file; + trx_t* trx = thr_get_trx(thr); + + row_ins_set_detailed(trx, foreign); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx, 600); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign, + TRUE); + putc('\n', ef); + fputs(errstr, ef); + fputs(" in parent table, in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->referenced_index->name); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in child table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->foreign_index->name); + if (rec) { + fputs(", there is a record:\n", ef); + rec_print(ef, rec, foreign->foreign_index); + } else { + fputs(", the record is not available\n", ef); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Reports a foreign key error to dict_foreign_err_file when we are trying +to add an index entry to a child table. Note that the adding may be the result +of an update, too. */ +static +void +row_ins_foreign_report_add_err( +/*===========================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a record in the parent table: + it does not match entry because we + have an error! */ + const dtuple_t* entry) /*!< in: index entry to insert in the + child table */ +{ + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx, 600); + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign, + TRUE); + fputs("\nTrying to add in child table, in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->foreign_index->name); + if (entry) { + fputs(" tuple:\n", ef); + /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized. + It would be better to only display the user columns. */ + dtuple_print(ef, entry); + } + fputs("\nBut in parent table ", ef); + ut_print_name(ef, trx, TRUE, foreign->referenced_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->referenced_index->name); + fputs(",\nthe closest match we can find is record:\n", ef); + if (rec && page_rec_is_supremum(rec)) { + /* If the cursor ended on a supremum record, it is better + to report the previous record in the error message, so that + the user gets a more descriptive error message. 
*/ + rec = page_rec_get_prev_const(rec); + } + + if (rec) { + rec_print(ef, rec, foreign->referenced_index); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Invalidate the query cache for the given table. */ +static +void +row_ins_invalidate_query_cache( +/*===========================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + const char* name) /*!< in: table name prefixed with + database name and a '/' character */ +{ + char* buf; + char* ptr; + ulint len = strlen(name) + 1; + + buf = mem_strdupl(name, len); + + ptr = strchr(buf, '/'); + ut_a(ptr); + *ptr = '\0'; + + innobase_invalidate_query_cache(thr_get_trx(thr), buf, len); + mem_free(buf); +} + +/*********************************************************************//** +Perform referential actions or checks when a parent row is deleted or updated +and the constraint had an ON DELETE or ON UPDATE condition which was not +RESTRICT. +@return DB_SUCCESS, DB_LOCK_WAIT, or error code */ +static +ulint +row_ins_foreign_check_on_constraint( +/*================================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + btr_pcur_t* pcur, /*!< in: cursor placed on a matching + index record in the child table */ + dtuple_t* entry, /*!< in: index entry in the parent + table */ + mtr_t* mtr) /*!< in: mtr holding the latch of pcur + page */ +{ + upd_node_t* node; + upd_node_t* cascade; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index; + dict_index_t* clust_index; + dtuple_t* ref; + mem_heap_t* upd_vec_heap = NULL; + const rec_t* rec; + const rec_t* clust_rec; + const buf_block_t* clust_block; + upd_t* update; + ulint n_to_update; + ulint err; + ulint i; + trx_t* trx; + mem_heap_t* tmp_heap = NULL; + + ut_a(thr); + ut_a(foreign); + ut_a(pcur); + ut_a(mtr); + + trx = thr_get_trx(thr); + + /* Since we are going to delete or update a row, we have to invalidate + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the sync0sync.h rank above the kernel mutex. The query cache mutex has + a rank just above the kernel mutex. */ + + row_ins_invalidate_query_cache(thr, table->name); + + node = thr->run_node; + + if (node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_DELETE_CASCADE + | DICT_FOREIGN_ON_DELETE_SET_NULL))) { + + row_ins_foreign_report_err("Trying to delete", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (!node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_UPDATE_CASCADE + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* This is an UPDATE */ + + row_ins_foreign_report_err("Trying to update", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (node->cascade_node == NULL) { + /* Extend our query graph by creating a child to current + update node. The child is used in the cascade or set null + operation. */ + + node->cascade_heap = mem_heap_create(128); + node->cascade_node = row_create_update_node_for_mysql( + table, node->cascade_heap); + que_node_set_parent(node->cascade_node, node); + } + + /* Initialize cascade_node to do the operation we want. 
Note that we + use the SAME cascade node to do all foreign key operations of the + SQL DELETE: the table of the cascade node may change if there are + several child tables to the table where the delete is done! */ + + cascade = node->cascade_node; + + cascade->table = table; + + cascade->foreign = foreign; + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = TRUE; + } else { + cascade->is_delete = FALSE; + + if (foreign->n_fields > cascade->update_n_fields) { + /* We have to make the update vector longer */ + + cascade->update = upd_create(foreign->n_fields, + node->cascade_heap); + cascade->update_n_fields = foreign->n_fields; + } + } + + /* We do not allow cyclic cascaded updating (DELETE is allowed, + but not UPDATE) of the same table, as this can lead to an infinite + cycle. Check that we are not updating the same table which is + already being modified in this cascade chain. We have to check + this also because the modification of the indexes of a 'parent' + table may still be incomplete, and we must avoid seeing the indexes + of the parent table in an inconsistent state! */ + + if (!cascade->is_delete + && row_ins_cascade_ancestor_updates_table(cascade, table)) { + + /* We do not know if this would break foreign key + constraints, but play safe and return an error */ + + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying an update, possibly causing a cyclic" + " cascaded update\n" + "in the child table,", thr, foreign, + btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (row_ins_cascade_n_ancestors(cascade) >= 15) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a too deep cascaded delete or update\n", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + index = btr_pcur_get_btr_cur(pcur)->index; + + ut_a(index == foreign->foreign_index); + + rec = btr_pcur_get_rec(pcur); + + if (dict_index_is_clust(index)) { + /* pcur is already positioned in the clustered index of + the child table */ + + clust_index = index; + clust_rec = rec; + clust_block = btr_pcur_get_block(pcur); + } else { + /* We have to look for the record in the clustered index + in the child table */ + + clust_index = dict_table_get_first_index(table); + + tmp_heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, + tmp_heap); + btr_pcur_open_with_no_init(clust_index, ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + cascade->pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(cascade->pcur); + clust_block = btr_pcur_get_block(cascade->pcur); + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + fputs("InnoDB: error in cascade of a foreign key op\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + fputs("\n" + "InnoDB: clustered record ", stderr); + rec_print(stderr, clust_rec, clust_index); + fputs("\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com\n", stderr); + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Set an X-lock on the row to delete or update in the child table */ + + err = lock_table(0, table, LOCK_IX, thr); + + if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + + err = 
lock_clust_rec_read_check_and_lock_alt( + 0, clust_block, clust_rec, clust_index, + LOCK_X, LOCK_REC_NOT_GAP, thr); + } + + if (err != DB_SUCCESS) { + + goto nonstandard_exit_func; + } + + if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) { + /* This can happen if there is a circular reference of + rows such that cascading delete comes to delete a row + already in the process of being delete marked */ + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + if ((node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)) + || (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* Build the appropriate update vector which sets + foreign->n_fields first fields in rec to SQL NULL */ + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + for (i = 0; i < foreign->n_fields; i++) { + upd_field_t* ufield = &update->fields[i]; + + ufield->field_no = dict_table_get_nth_col_pos( + table, + dict_index_get_nth_col_no(index, i)); + ufield->orig_len = 0; + ufield->exp = NULL; + dfield_set_null(&ufield->new_val); + } + } + + if (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) { + + /* Build the appropriate update vector which sets changing + foreign->n_fields first fields in rec to new values */ + + upd_vec_heap = mem_heap_create(256); + + n_to_update = row_ins_cascade_calc_update_vec(node, foreign, + upd_vec_heap); + if (n_to_update == ULINT_UNDEFINED) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a cascaded update where the" + " updated value in the child\n" + "table would not fit in the length" + " of the column, or the value would\n" + "be NULL and the column is" + " declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (cascade->update->n_fields == 0) { + + /* The update does not change any columns referred + to in this foreign key constraint: no need to do + anything */ + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Store pcur position and initialize or store the cascade node + pcur stored position */ + + btr_pcur_store_position(pcur, mtr); + + if (index == clust_index) { + btr_pcur_copy_stored_position(cascade->pcur, pcur); + } else { + btr_pcur_store_position(cascade->pcur, mtr); + } + + mtr_commit(mtr); + + ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON); + + cascade->state = UPD_NODE_UPDATE_CLUSTERED; + + err = row_update_cascade_for_mysql(thr, cascade, + foreign->foreign_table); + + if (foreign->foreign_table->n_foreign_key_checks_running == 0) { + fprintf(stderr, + "InnoDB: error: table %s has the counter 0" + " though there is\n" + "InnoDB: a FOREIGN KEY check running on it.\n", + foreign->foreign_table->name); + } + + /* Release the data dictionary latch for a while, so that we do not + starve other threads from doing CREATE TABLE etc. if we have a huge + cascaded operation running. The counter n_foreign_key_checks_running + will prevent other users from dropping or ALTERing the table when we + release the latch. 
*/ + + row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); + row_mysql_freeze_data_dictionary(thr_get_trx(thr)); + + mtr_start(mtr); + + /* Restore pcur position */ + + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + + return(err); + +nonstandard_exit_func: + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + + btr_pcur_store_position(pcur, mtr); + + mtr_commit(mtr); + mtr_start(mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + return(err); +} + +/*********************************************************************//** +Sets a shared lock on a record. Used in locking possible duplicate key +records and also in checking foreign key constraints. +@return DB_SUCCESS or error code */ +static +ulint +row_ins_set_shared_rec_lock( +/*========================*/ + ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } + + return(err); +} + +/*********************************************************************//** +Sets a exclusive lock on a record. Used in locking possible duplicate key +records +@return DB_SUCCESS or error code */ +static +ulint +row_ins_set_exclusive_rec_lock( +/*===========================*/ + ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } + + return(err); +} + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_operation_lock. 
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ +UNIV_INTERN +ulint +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE if we want to check that + the referenced table is ok, FALSE if we + want to to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* upd_node; + dict_table_t* check_table; + dict_index_t* check_index; + ulint n_fields_cmp; + btr_pcur_t pcur; + ibool moved; + int cmp; + ulint err; + ulint i; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + +run_again: +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + err = DB_SUCCESS; + + if (trx->check_foreigns == FALSE) { + /* The user has suppressed foreign key checks currently for + this session */ + goto exit_func; + } + + /* If any of the foreign key fields in entry is SQL NULL, we + suppress the foreign key check: this is compatible with Oracle, + for example */ + + for (i = 0; i < foreign->n_fields; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + goto exit_func; + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) { + upd_node = thr->run_node; + + if (!(upd_node->is_delete) && upd_node->foreign == foreign) { + /* If a cascaded update is done as defined by a + foreign key constraint, do not check that + constraint for the child row. In ON UPDATE CASCADE + the update of the parent row is only half done when + we come here: if we would check the constraint here + for the child row it would fail. + + A QUESTION remains: if in the child table there are + several constraints which refer to the same parent + table, we should merge all updates to the child as + one update? And the updates can be contradictory! + Currently we just perform the update associated + with each foreign key constraint, one after + another, and the user has problems predicting in + which order they are performed. 
*/ + + goto exit_func; + } + } + + if (check_ref) { + check_table = foreign->referenced_table; + check_index = foreign->referenced_index; + } else { + check_table = foreign->foreign_table; + check_index = foreign->foreign_index; + } + + if (check_table == NULL || check_table->ibd_file_missing) { + if (check_ref) { + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx, 600); + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, + foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format( + ef, trx, foreign, TRUE); + fputs("\nTrying to add to index ", ef); + ut_print_name(ef, trx, FALSE, + foreign->foreign_index->name); + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + fputs("\nBut the parent table ", ef); + ut_print_name(ef, trx, TRUE, + foreign->referenced_table_name); + fputs("\nor its .ibd file does" + " not currently exist!\n", ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_NO_REFERENCED_ROW; + } + + goto exit_func; + } + + ut_a(check_table); + ut_a(check_index); + + if (check_table != table) { + /* We already have a LOCK_IX on table, but not necessarily + on check_table */ + + err = lock_table(0, check_table, LOCK_IS, thr); + + if (err != DB_SUCCESS) { + + goto do_possible_lock_wait; + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, foreign->n_fields); + + btr_pcur_open(check_index, entry, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + /* Scan index records and check if there is a matching record */ + + for (;;) { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + if (page_rec_is_infimum(rec)) { + + goto next_rec; + } + + offsets = rec_get_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(rec)) { + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block, + rec, check_index, + offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + + goto next_rec; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, block, + rec, check_index, offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + } else { + /* Found a matching record. Lock only + a record because we can allow inserts + into gaps */ + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, block, + rec, check_index, offsets, thr); + + if (err != DB_SUCCESS) { + + break; + } + + if (check_ref) { + err = DB_SUCCESS; + + break; + } else if (foreign->type != 0) { + /* There is an ON UPDATE or ON DELETE + condition: check them in a separate + function */ + + err = row_ins_foreign_check_on_constraint( + thr, foreign, &pcur, entry, + &mtr); + if (err != DB_SUCCESS) { + /* Since reporting a plain + "duplicate key" error + message to the user in + cases where a long CASCADE + operation would lead to a + duplicate key in some + other table is very + confusing, map duplicate + key errors resulting from + FK constraints to a + separate error code. 
*/ + + if (err == DB_DUPLICATE_KEY) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + + break; + } + + /* row_ins_foreign_check_on_constraint + may have repositioned pcur on a + different block */ + block = btr_pcur_get_block(&pcur); + } else { + row_ins_foreign_report_err( + "Trying to delete or update", + thr, foreign, rec, entry); + + err = DB_ROW_IS_REFERENCED; + break; + } + } + } + + if (cmp < 0) { + err = row_ins_set_shared_rec_lock( + LOCK_GAP, block, + rec, check_index, offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + + if (check_ref) { + err = DB_NO_REFERENCED_ROW; + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + } else { + err = DB_SUCCESS; + } + + break; + } + + ut_a(cmp == 0); +next_rec: + moved = btr_pcur_move_to_next(&pcur, &mtr); + + if (!moved) { + if (check_ref) { + rec = btr_pcur_get_rec(&pcur); + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + err = DB_NO_REFERENCED_ROW; + } else { + err = DB_SUCCESS; + } + + break; + } + } + + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + +do_possible_lock_wait: + if (err == DB_LOCK_WAIT) { + trx->error_state = err; + + que_thr_stop_for_mysql(thr); + + srv_suspend_mysql_thread(thr); + + if (trx->error_state == DB_SUCCESS) { + + goto run_again; + } + + err = trx->error_state; + } + +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/***************************************************************//** +Checks if foreign key constraints fail for an index entry. If index +is not mentioned in any constraint, this function does nothing, +Otherwise does searches to the indexes of referenced tables and +sets shared locks which lock either the success or the failure of +a constraint. +@return DB_SUCCESS or error code */ +static +ulint +row_ins_check_foreign_constraints( +/*==============================*/ + dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_foreign_t* foreign; + ulint err; + trx_t* trx; + ibool got_s_lock = FALSE; + + trx = thr_get_trx(thr); + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign) { + if (foreign->foreign_index == index) { + + if (foreign->referenced_table == NULL) { + dict_table_get(foreign->referenced_table_name, + FALSE); + } + + if (0 == trx->dict_operation_lock_mode) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + if (foreign->referenced_table) { + mutex_enter(&(dict_sys->mutex)); + + (foreign->referenced_table + ->n_foreign_key_checks_running)++; + + mutex_exit(&(dict_sys->mutex)); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects the referenced + table from being dropped while the check is running. 
*/ + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + if (foreign->referenced_table) { + mutex_enter(&(dict_sys->mutex)); + + ut_a(foreign->referenced_table + ->n_foreign_key_checks_running > 0); + (foreign->referenced_table + ->n_foreign_key_checks_running)--; + + mutex_exit(&(dict_sys->mutex)); + } + + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + if (err != DB_SUCCESS) { + return(err); + } + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + return(DB_SUCCESS); +} + +/***************************************************************//** +Checks if a unique key violation to rec would occur at the index entry +insert. +@return TRUE if error */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + const rec_t* rec, /*!< in: user record; NOTE that we assume + that the caller already has a record lock on + the record! */ + const dtuple_t* entry, /*!< in: entry to insert */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!dict_index_is_clust(index)) { + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); +} + +/***************************************************************//** +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. +Set shared locks on possible duplicate records. 
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */ +static +ulint +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + dict_index_t* index, /*!< in: non-clustered unique index */ + dtuple_t* entry, /*!< in: index entry */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint n_unique; + ulint i; + int cmp; + ulint n_fields_cmp; + btr_pcur_t pcur; + ulint err = DB_SUCCESS; + unsigned allow_duplicates; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(DB_SUCCESS); + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index)); + + btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + + allow_duplicates = thr_get_trx(thr)->duplicates & TRX_DUP_IGNORE; + + /* Scan index records and check if there is a duplicate */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (allow_duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_ORDINARY, block, + rec, index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, block, + rec, index, offsets, thr); + } + + if (err != DB_SUCCESS) { + + break; + } + + if (page_rec_is_supremum(rec)) { + + continue; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { + err = DB_DUPLICATE_KEY; + + thr_get_trx(thr)->error_info = index; + + break; + } + } + + if (cmp < 0) { + break; + } + + ut_a(cmp == 0); + } while (btr_pcur_move_to_next(&pcur, &mtr)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + return(err); +} + +/***************************************************************//** +Checks if a unique key violation error would occur at an index entry +insert. Sets shared locks on possible duplicate records. Works only +for a clustered index! 
+@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error, +DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate +record */ +static +ulint +row_ins_duplicate_error_in_clust( +/*=============================*/ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + dtuple_t* entry, /*!< in: entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint err; + rec_t* rec; + ulint n_unique; + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + UT_NOT_USED(mtr); + + ut_a(dict_index_is_clust(cursor->index)); + ut_ad(dict_index_is_unique(cursor->index)); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row refererence part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that in the B-tree node pointers on an + upper level may match more to the entry than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case. */ + + n_unique = dict_index_get_n_unique(cursor->index); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + /* We set a lock on the possible duplicate: this + is needed in logical logging of MySQL to make + sure that in roll-forward we get the same duplicate + errors as in original execution */ + + if (trx->duplicates & TRX_DUP_IGNORE) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), rec, + cursor->index, offsets, thr); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index, offsets)) { + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + } + } + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + if (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + if (trx->duplicates & TRX_DUP_IGNORE) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). 
*/ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index, offsets)) { + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + } + + ut_a(!dict_index_is_clust(cursor->index)); + /* This should never happen */ + } + + err = DB_SUCCESS; +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/***************************************************************//** +Checks if an index entry has long enough common prefix with an existing +record so that the intended insert of the entry must be changed to a modify of +the existing record. In the case of a clustered index, the prefix must be +n_unique fields long, and in the case of a secondary index, all fields must be +equal. +@return 0 if no update, ROW_INS_PREV if previous should be updated; +currently we do the search so that only the low_match record can match +enough to the search tuple, not the next record */ +UNIV_INLINE +ulint +row_ins_must_modify( +/*================*/ + btr_cur_t* cursor) /*!< in: B-tree cursor */ +{ + ulint enough_match; + rec_t* rec; + + /* NOTE: (compare to the note in row_ins_duplicate_error) Because node + pointers on upper levels of the B-tree may match more to entry than + to actual user records on the leaf level, we have to check if the + candidate record is actually a user record. In a clustered index + node pointers contain index->n_unique first fields, and in the case + of a secondary index, all fields of the index. */ + + enough_match = dict_index_get_n_unique_in_tree(cursor->index); + + if (cursor->low_match >= enough_match) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + + return(ROW_INS_PREV); + } + } + + return(0); +} + +/***************************************************************//** +Tries to insert an index entry to an index. If the index is clustered +and a record with the same unique key is found, the other record is +necessarily marked deleted by a committed transaction, or a unique key +violation error occurs. The delete marked record is then updated to an +existing record, and we must write an undo log record on the delete +marked record. If the index is secondary, and a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed, +or error code */ +static +ulint +row_ins_index_entry_low( +/*====================*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_cur_t cursor; + ulint ignore_sec_unique = 0; + ulint modify = 0; /* remove warning */ + rec_t* insert_rec; + rec_t* rec; + ulint err; + ulint n_unique; + big_rec_t* big_rec = NULL; + mtr_t mtr; + mem_heap_t* heap = NULL; + + log_free_check(); + + mtr_start(&mtr); + + cursor.thr = thr; + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (!(thr_get_trx(thr)->check_unique_secondary)) { + ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE; + } + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + mode | BTR_INSERT | ignore_sec_unique, + &cursor, 0, &mtr); + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + /* The insertion was made to the insert buffer already during + the search: we are done */ + + err = DB_SUCCESS; + + goto function_exit; + } + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } +#endif + + n_unique = dict_index_get_n_unique(index); + + if (dict_index_is_unique(index) && (cursor.up_match >= n_unique + || cursor.low_match >= n_unique)) { + + if (dict_index_is_clust(index)) { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + &cursor, entry, thr, &mtr); + if (err != DB_SUCCESS) { + + goto function_exit; + } + } else { + mtr_commit(&mtr); + err = row_ins_scan_sec_index_for_duplicate( + index, entry, thr); + mtr_start(&mtr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion. 
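+			(The mini-transaction was committed before that scan
+			and restarted after it, so the page latch was
+			released and the old cursor position can no longer
+			be trusted; hence the fresh
+			btr_cur_search_to_nth_level() call below.)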
*/ + + btr_cur_search_to_nth_level(index, 0, entry, + PAGE_CUR_LE, + mode | BTR_INSERT, + &cursor, 0, &mtr); + } + } + + modify = row_ins_must_modify(&cursor); + + if (modify != 0) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + + if (modify == ROW_INS_NEXT) { + rec = page_rec_get_next(btr_cur_get_rec(&cursor)); + + btr_cur_position(index, rec, + btr_cur_get_block(&cursor),&cursor); + } + + if (dict_index_is_clust(index)) { + err = row_ins_clust_index_entry_by_modify( + mode, &cursor, &heap, &big_rec, entry, + thr, &mtr); + } else { + ut_ad(!n_ext); + err = row_ins_sec_index_entry_by_modify( + mode, &cursor, entry, thr, &mtr); + } + } else { + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert( + 0, &cursor, entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + + goto function_exit; + } + err = btr_cur_pessimistic_insert( + 0, &cursor, entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + +function_exit: + mtr_commit(&mtr); + + if (UNIV_LIKELY_NULL(big_rec)) { + rec_t* rec; + ulint* offsets; + mtr_start(&mtr); + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, NULL, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr); + + if (modify) { + dtuple_big_rec_free(big_rec); + } else { + dtuple_convert_back_big_rec(index, entry, big_rec); + } + + mtr_commit(&mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/***************************************************************//** +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +ulint +row_ins_index_entry( +/*================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + ibool foreign,/*!< in: TRUE=check foreign key constraints */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + + if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) { + err = row_ins_check_foreign_constraints(index->table, index, + entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + /* Try first optimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, + n_ext, thr); + if (err != DB_FAIL) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, + n_ext, thr); + return(err); +} + +/***********************************************************//** +Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. 
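+For example, for a prefix index such as KEY(c(10)) on a multi-byte character
+set column, at most 10 characters of the row value are copied into the entry
+field, via the dtype_get_at_most_n_mbchars() call below.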
*/ +static +void +row_ins_index_entry_set_vals( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to make */ + const dtuple_t* row) /*!< in: row */ +{ + ulint n_fields; + ulint i; + + ut_ad(entry && row); + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields; i++) { + dict_field_t* ind_field; + dfield_t* field; + const dfield_t* row_field; + ulint len; + + field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + row_field = dtuple_get_nth_field(row, ind_field->col->ind); + len = dfield_get_len(row_field); + + /* Check column prefix indexes */ + if (ind_field->prefix_len > 0 + && dfield_get_len(row_field) != UNIV_SQL_NULL) { + + const dict_col_t* col + = dict_field_get_col(ind_field); + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, + len, dfield_get_data(row_field)); + + ut_ad(!dfield_is_ext(row_field)); + } + + dfield_set_data(field, dfield_get_data(row_field), len); + if (dfield_is_ext(row_field)) { + ut_ad(dict_index_is_clust(index)); + dfield_set_ext(field); + } + } +} + +/***********************************************************//** +Inserts a single index entry to the table. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +ulint +row_ins_index_entry_step( +/*=====================*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + + ut_ad(dtuple_check_typed(node->row)); + + row_ins_index_entry_set_vals(node->index, node->entry, node->row); + + ut_ad(dtuple_check_typed(node->entry)); + + err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr); + + return(err); +} + +/***********************************************************//** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + dulint row_id; + + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + + if (dict_index_is_unique(dict_table_get_first_index(node->table))) { + + /* No row id is stored if the clustered index is unique */ + + return; + } + + /* Fill in row id value to row */ + + row_id = dict_sys_get_new_row_id(); + + dict_sys_write_row_id(node->row_id_buf, row_id); +} + +/***********************************************************//** +Gets a row to insert from the values list. */ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Gets a row to insert from the select list. 
*/ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Inserts a row to a table. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +ulint +row_ins( +/*====*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + node->entry = UT_LIST_GET_FIRST(node->entry_list); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (node->index != NULL) { + err = row_ins_index_entry_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + } + + ut_ad(node->entry == NULL); + + node->state = INS_NODE_ALLOC_ROW_ID; + + return(DB_SUCCESS); +} + +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_ins_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + ulint err; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started(trx); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. MySQL's + partitioned table code may also call an insert within the same + SQL statement AFTER it has used this table handle to do a search. + This happens, for example, when a row update moves it to another + partition. In that case, we have already set the IX lock on the + table during the search operation, and there is no need to set + it again here. But we must write trx->id to node->trx_id_buf. 
*/
+
+	trx_write_trx_id(node->trx_id_buf, trx->id);
+
+	if (node->state == INS_NODE_SET_IX_LOCK) {
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+			/* No need to do IX-locking */
+
+			goto same_trx;
+		}
+
+		err = lock_table(0, node->table, LOCK_IX, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto error_handling;
+		}
+
+		node->trx_id = trx->id;
+same_trx:
+		node->state = INS_NODE_ALLOC_ROW_ID;
+
+		if (node->ins_type == INS_SEARCHED) {
+			/* Reset the cursor */
+			sel_node->state = SEL_NODE_OPEN;
+
+			/* Fetch a row to insert */
+
+			thr->run_node = sel_node;
+
+			return(thr);
+		}
+	}
+
+	if ((node->ins_type == INS_SEARCHED)
+	    && (sel_node->state != SEL_NODE_FETCH)) {
+
+		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* No more rows to insert */
+		thr->run_node = parent;
+
+		return(thr);
+	}
+
+	/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+	err = row_ins(node, thr);
+
+error_handling:
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		/* err == DB_LOCK_WAIT or SQL error detected */
+		return(NULL);
+	}
+
+	/* DO THE TRIGGER ACTIONS HERE */
+
+	if (node->ins_type == INS_SEARCHED) {
+		/* Fetch a row to insert */
+
+		thr->run_node = sel_node;
+	} else {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c
new file mode 100644
index 00000000000..05a45dc647c
--- /dev/null
+++ b/storage/innobase/row/row0merge.c
@@ -0,0 +1,2364 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.c
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "os0file.h"
+#include "lock0lock.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "log0log.h"
+#include "ut0sort.h"
+#include "handler0alter.h"
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout.
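+For example, to trace each merge comparison, read and write, one could set
+these from a debugger in a debug build, or patch them temporarily:
+	row_merge_print_cmp = TRUE;
+	row_merge_print_read = TRUE;
+	row_merge_print_write = TRUE;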
*/ +/* @{ */ +static ibool row_merge_print_cmp; +static ibool row_merge_print_read; +static ibool row_merge_print_write; +/* @} */ +#endif /* UNIV_DEBUG */ + +/** @brief Block size for I/O operations in merge sort. + +The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty() +rounded to a power of 2. + +When not creating a PRIMARY KEY that contains column prefixes, this +can be set as small as UNIV_PAGE_SIZE / 2. See the comment above +ut_ad(data_size < sizeof(row_merge_block_t)). */ +typedef byte row_merge_block_t[1048576]; + +/** @brief Secondary buffer for I/O operations of merge records. + +This buffer is used for writing or reading a record that spans two +row_merge_block_t. Thus, it must be able to hold one merge record, +whose maximum size is the same as the minimum size of +row_merge_block_t. */ +typedef byte mrec_buf_t[UNIV_PAGE_SIZE]; + +/** @brief Merge record in row_merge_block_t. + +The format is the same as a record in ROW_FORMAT=COMPACT with the +exception that the REC_N_NEW_EXTRA_BYTES are omitted. */ +typedef byte mrec_t; + +/** Buffer for sorting in main memory. */ +struct row_merge_buf_struct { + mem_heap_t* heap; /*!< memory heap where allocated */ + dict_index_t* index; /*!< the index the tuples belong to */ + ulint total_size; /*!< total amount of data bytes */ + ulint n_tuples; /*!< number of data tuples */ + ulint max_tuples; /*!< maximum number of data tuples */ + const dfield_t**tuples; /*!< array of pointers to + arrays of fields that form + the data tuples */ + const dfield_t**tmp_tuples; /*!< temporary copy of tuples, + for sorting */ +}; + +/** Buffer for sorting in main memory. */ +typedef struct row_merge_buf_struct row_merge_buf_t; + +/** Information about temporary files used in merge sort */ +struct merge_file_struct { + int fd; /*!< file descriptor */ + ulint offset; /*!< file offset */ +}; + +/** Information about temporary files used in merge sort */ +typedef struct merge_file_struct merge_file_t; + +#ifdef UNIV_DEBUG +/******************************************************//** +Display a merge tuple. */ +static +void +row_merge_tuple_print( +/*==================*/ + FILE* f, /*!< in: output stream */ + const dfield_t* entry, /*!< in: tuple to print */ + ulint n_fields)/*!< in: number of fields in the tuple */ +{ + ulint j; + + for (j = 0; j < n_fields; j++) { + const dfield_t* field = &entry[j]; + + if (dfield_is_null(field)) { + fputs("\n NULL;", f); + } else { + ulint field_len = dfield_get_len(field); + ulint len = ut_min(field_len, 20); + if (dfield_is_ext(field)) { + fputs("\nE", f); + } else { + fputs("\n ", f); + } + ut_print_buf(f, dfield_get_data(field), len); + if (len != field_len) { + fprintf(f, " (total %lu bytes)", field_len); + } + } + } + putc('\n', f); +} +#endif /* UNIV_DEBUG */ + +/******************************************************//** +Allocate a sort buffer. 
+@return own: sort buffer */ +static +row_merge_buf_t* +row_merge_buf_create_low( +/*=====================*/ + mem_heap_t* heap, /*!< in: heap where allocated */ + dict_index_t* index, /*!< in: secondary index */ + ulint max_tuples, /*!< in: maximum number of data tuples */ + ulint buf_size) /*!< in: size of the buffer, in bytes */ +{ + row_merge_buf_t* buf; + + ut_ad(max_tuples > 0); + ut_ad(max_tuples <= sizeof(row_merge_block_t)); + ut_ad(max_tuples < buf_size); + + buf = mem_heap_zalloc(heap, buf_size); + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = mem_heap_alloc(heap, + 2 * max_tuples * sizeof *buf->tuples); + buf->tmp_tuples = buf->tuples + max_tuples; + + return(buf); +} + +/******************************************************//** +Allocate a sort buffer. +@return own: sort buffer */ +static +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + dict_index_t* index) /*!< in: secondary index */ +{ + row_merge_buf_t* buf; + ulint max_tuples; + ulint buf_size; + mem_heap_t* heap; + + max_tuples = sizeof(row_merge_block_t) + / ut_max(1, dict_index_get_min_size(index)); + + buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples; + + heap = mem_heap_create(buf_size + sizeof(row_merge_block_t)); + + buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size); + + return(buf); +} + +/******************************************************//** +Empty a sort buffer. +@return sort buffer */ +static +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ +{ + ulint buf_size; + ulint max_tuples = buf->max_tuples; + mem_heap_t* heap = buf->heap; + dict_index_t* index = buf->index; + + buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples; + + mem_heap_empty(heap); + + return(row_merge_buf_create_low(heap, index, max_tuples, buf_size)); +} + +/******************************************************//** +Deallocate a sort buffer. */ +static +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */ +{ + mem_heap_free(buf->heap); +} + +/******************************************************//** +Insert a data tuple into a sort buffer. 
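+The typical cycle, as in row_merge_read_clustered_index() below: tuples are
+added until this function returns FALSE; the full buffer is then sorted with
+row_merge_buf_sort(), flushed with row_merge_buf_write() and row_merge_write(),
+recycled with row_merge_buf_empty(), and the rejected tuple is added again.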
+@return TRUE if added, FALSE if out of space */ +static +ibool +row_merge_buf_add( +/*==============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + const dtuple_t* row, /*!< in: row in clustered index */ + const row_ext_t* ext) /*!< in: cache of externally stored + column prefixes, or NULL */ +{ + ulint i; + ulint n_fields; + ulint data_size; + ulint extra_size; + const dict_index_t* index; + dfield_t* entry; + dfield_t* field; + + if (buf->n_tuples >= buf->max_tuples) { + return(FALSE); + } + + UNIV_PREFETCH_R(row->fields); + + index = buf->index; + + n_fields = dict_index_get_n_fields(index); + + entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry); + buf->tuples[buf->n_tuples] = entry; + field = entry; + + data_size = 0; + extra_size = UT_BITS_IN_BYTES(index->n_nullable); + + for (i = 0; i < n_fields; i++, field++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint col_no; + const dfield_t* row_field; + ulint len; + + ifield = dict_index_get_nth_field(index, i); + col = ifield->col; + col_no = dict_col_get_no(col); + row_field = dtuple_get_nth_field(row, col_no); + dfield_copy(field, row_field); + len = dfield_get_len(field); + + if (dfield_is_null(field)) { + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } else if (UNIV_LIKELY(!ext)) { + } else if (dict_index_is_clust(index)) { + /* Flag externally stored fields. */ + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + if (i < dict_index_get_n_unique(index)) { + dfield_set_data(field, buf, len); + } else { + dfield_set_ext(field); + len = dfield_get_len(field); + } + } + } else { + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + dfield_set_data(field, buf, len); + } + } + + /* If a column prefix index, take only the prefix */ + + if (ifield->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + ifield->prefix_len, + len, dfield_get_data(field)); + dfield_set_len(field, len); + } + + ut_ad(len <= col->len || col->mtype == DATA_BLOB); + + if (ifield->fixed_len) { + ut_ad(len == ifield->fixed_len); + ut_ad(!dfield_is_ext(field)); + } else if (dfield_is_ext(field)) { + extra_size += 2; + } else if (len < 128 + || (col->len < 256 && col->mtype != DATA_BLOB)) { + extra_size++; + } else { + /* For variable-length columns, we look up the + maximum length from the column itself. If this + is a prefix index column shorter than 256 bytes, + this will waste one byte. */ + extra_size += 2; + } + data_size += len; + } + +#ifdef UNIV_DEBUG + { + ulint size; + ulint extra; + + size = rec_get_converted_size_comp(index, + REC_STATUS_ORDINARY, + entry, n_fields, &extra); + + ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size); + ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra); + } +#endif /* UNIV_DEBUG */ + + /* Add to the total size of the record in row_merge_block_t + the encoded length of extra_size and the extra bytes (extra_size). + See row_merge_buf_write() for the variable-length encoding + of extra_size. */ + data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); + + /* The following assertion may fail if row_merge_block_t is + declared very small and a PRIMARY KEY is being created with + many prefix columns. In that case, the record may exceed the + page_zip_rec_needs_ext() limit. However, no further columns + will be moved to external storage until the record is inserted + to the clustered index B-tree. 
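+	As a worked example of the size accounting a few lines above:
+	extra_size = 3 is encoded in the single length byte 0x04, adding
+	3 + 1 bytes on top of the data bytes, while extra_size = 0x90 makes
+	extra_size + 1 = 0x91 >= 0x80, which needs the two length bytes
+	0x80 0x91, hence the second term of that sum.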
*/ + ut_ad(data_size < sizeof(row_merge_block_t)); + + /* Reserve one byte for the end marker of row_merge_block_t. */ + if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) { + return(FALSE); + } + + buf->total_size += data_size; + buf->n_tuples++; + + field = entry; + + /* Copy the data fields. */ + + do { + dfield_dup(field++, buf->heap); + } while (--n_fields); + + return(TRUE); +} + +/** Structure for reporting duplicate records. */ +struct row_merge_dup_struct { + const dict_index_t* index; /*!< index being sorted */ + TABLE* table; /*!< MySQL table object */ + ulint n_dup; /*!< number of duplicates */ +}; + +/** Structure for reporting duplicate records. */ +typedef struct row_merge_dup_struct row_merge_dup_t; + +/*************************************************************//** +Report a duplicate key. */ +static +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t* entry) /*!< in: duplicate index entry */ +{ + mrec_buf_t buf; + const dtuple_t* tuple; + dtuple_t tuple_store; + const rec_t* rec; + const dict_index_t* index = dup->index; + ulint n_fields= dict_index_get_n_fields(index); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + ulint n_ext; + + if (dup->n_dup++) { + /* Only report the first duplicate record, + but count all duplicate records. */ + return; + } + + rec_offs_init(offsets_); + + /* Convert the tuple to a record and then to MySQL format. */ + + tuple = dtuple_from_fields(&tuple_store, entry, n_fields); + n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0; + + rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext); + offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, + &heap); + + innobase_rec_to_mysql(dup->table, rec, index, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*************************************************************//** +Compare two tuples. +@return 1, 0, -1 if a is greater, equal, less, respectively, than b */ +static +int +row_merge_tuple_cmp( +/*================*/ + ulint n_field,/*!< in: number of fields */ + const dfield_t* a, /*!< in: first tuple to be compared */ + const dfield_t* b, /*!< in: second tuple to be compared */ + row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */ +{ + int cmp; + const dfield_t* field = a; + + /* Compare the fields of the tuples until a difference is + found or we run out of fields to compare. If !cmp at the + end, the tuples are equal. */ + do { + cmp = cmp_dfield_dfield(a++, b++); + } while (!cmp && --n_field); + + if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) { + /* Report a duplicate value error if the tuples are + logically equal. NULL columns are logically inequal, + although they are equal in the sorting order. Find + out if any of the fields are NULL. */ + for (b = field; b != a; b++) { + if (dfield_is_null(b)) { + + goto func_exit; + } + } + + row_merge_dup_report(dup, field); + } + +func_exit: + return(cmp); +} + +/** Wrapper for row_merge_tuple_sort() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param a array of tuples that being sorted +@param b aux (work area), same size as tuples[] +@param c lower bound of the sorting area, inclusive +@param d upper bound of the sorting area, inclusive */ +#define row_merge_tuple_sort_ctx(a,b,c,d) \ + row_merge_tuple_sort(n_field, dup, a, b, c, d) +/** Wrapper for row_merge_tuple_cmp() to inject some more context to +UT_SORT_FUNCTION_BODY(). 
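+UT_SORT_FUNCTION_BODY() passes only the tuple pointers to its comparator, so
+this wrapper, like row_merge_tuple_sort_ctx() above, picks up n_field and dup
+from the scope of the function in which the sort body is expanded.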
+@param a first tuple to be compared +@param b second tuple to be compared +@return 1, 0, -1 if a is greater, equal, less, respectively, than b */ +#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup) + +/**********************************************************************//** +Merge sort the tuple buffer in main memory. */ +static +void +row_merge_tuple_sort( +/*=================*/ + ulint n_field,/*!< in: number of fields */ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t** tuples, /*!< in/out: tuples */ + const dfield_t** aux, /*!< in/out: work area */ + ulint low, /*!< in: lower bound of the + sorting area, inclusive */ + ulint high) /*!< in: upper bound of the + sorting area, exclusive */ +{ + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, + tuples, aux, low, high, row_merge_tuple_cmp_ctx); +} + +/******************************************************//** +Sort a buffer. */ +static +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */ +{ + row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup, + buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); +} + +/******************************************************//** +Write a buffer to a block. */ +static +void +row_merge_buf_write( +/*================*/ + const row_merge_buf_t* buf, /*!< in: sorted buffer */ +#ifdef UNIV_DEBUG + const merge_file_t* of, /*!< in: output file */ +#endif /* UNIV_DEBUG */ + row_merge_block_t* block) /*!< out: buffer for writing to file */ +#ifndef UNIV_DEBUG +# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block) +#endif /* !UNIV_DEBUG */ +{ + const dict_index_t* index = buf->index; + ulint n_fields= dict_index_get_n_fields(index); + byte* b = &(*block)[0]; + + ulint i; + + for (i = 0; i < buf->n_tuples; i++) { + ulint size; + ulint extra_size; + const dfield_t* entry = buf->tuples[i]; + + size = rec_get_converted_size_comp(index, + REC_STATUS_ORDINARY, + entry, n_fields, + &extra_size); + ut_ad(size > extra_size); + ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES); + extra_size -= REC_N_NEW_EXTRA_BYTES; + size -= REC_N_NEW_EXTRA_BYTES; + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *b++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *b++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *b++ = (byte) (extra_size + 1); + } + + ut_ad(b + size < block[1]); + + rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index, + REC_STATUS_ORDINARY, + entry, n_fields); + + b += size; + +#ifdef UNIV_DEBUG + if (row_merge_print_write) { + fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu", + (void*) b, of->fd, (ulong) of->offset, + (ulong) i); + row_merge_tuple_print(stderr, entry, n_fields); + } +#endif /* UNIV_DEBUG */ + } + + /* Write an "end-of-chunk" marker. */ + ut_a(b < block[1]); + ut_a(b == block[0] + buf->total_size); + *b++ = 0; +#ifdef UNIV_DEBUG_VALGRIND + /* The rest of the block is uninitialized. Initialize it + to avoid bogus warnings. */ + memset(b, 0xff, block[1] - b); +#endif /* UNIV_DEBUG_VALGRIND */ +#ifdef UNIV_DEBUG + if (row_merge_print_write) { + fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n", + (void*) b, of->fd, (ulong) of->offset); + } +#endif /* UNIV_DEBUG */ +} + +/******************************************************//** +Create a memory heap and allocate space for row_merge_rec_offsets(). 
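+Both offsets arrays are preinitialized the way rec_get_offsets() would fill
+them in: slot 0 holds the allocated size of the array and slot 1 the number
+of fields, as expected by rec_init_offsets_comp_ordinary() in
+row_merge_read_rec().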
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint**			offsets1,	/*!< out: offsets */
+	ulint**			offsets2)	/*!< out: offsets */
+{
+	ulint	i	= 1 + REC_OFFS_HEADER_SIZE
+		+ dict_index_get_n_fields(index);
+	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof *offsets1);
+
+	*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
+	*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+
+	(*offsets1)[0] = (*offsets2)[0] = i;
+	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
+
+	return(heap);
+}
+
+/**********************************************************************//**
+Search an index object by name and column names. If several indexes match,
+return the index with the max id.
+@return matching index, NULL if not found */
+static
+dict_index_t*
+row_merge_dict_table_get_index(
+/*===========================*/
+	dict_table_t*		table,		/*!< in: table */
+	const merge_index_def_t*index_def)	/*!< in: index definition */
+{
+	ulint		i;
+	dict_index_t*	index;
+	const char**	column_names;
+
+	column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
+
+	for (i = 0; i < index_def->n_fields; ++i) {
+		column_names[i] = index_def->fields[i].field_name;
+	}
+
+	index = dict_table_get_index_by_max_id(
+		table, index_def->name, column_names, index_def->n_fields);
+
+	mem_free((void*) column_names);
+
+	return(index);
+}
+
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if request was successful, FALSE if fail */
+static
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read */
+	row_merge_block_t*	buf)	/*!< out: data */
+{
+	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
+	ibool		success;
+
+	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+						 (ulint) (ofs & 0xFFFFFFFF),
+						 (ulint) (ofs >> 32),
+						 sizeof *buf);
+	if (UNIV_UNLIKELY(!success)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: failed to read merge block at %llu\n", ofs);
+	}
+
+	return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if request was successful, FALSE if fail */
+static
+ibool
+row_merge_write(
+/*============*/
+	int		fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write */
+	const void*	buf)	/*!< in: data */
+{
+	ib_uint64_t	ofs = ((ib_uint64_t) offset)
+		* sizeof(row_merge_block_t);
+
+	return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+					 (ulint) (ofs & 0xFFFFFFFF),
+					 (ulint) (ofs >> 32),
+					 sizeof(row_merge_block_t))));
+}
+
+/********************************************************************//**
+Read a merge record.
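+Each record is preceded by its extra_size + 1 in a variable-length encoding:
+a value below 0x80 takes one byte, larger values take two bytes with the high
+bit set in the first (for example, 0x1234 is stored as 0x92 0x34), and a zero
+length byte marks the end of the list; see row_merge_buf_write() and
+row_merge_write_rec_low() for the writing side.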
+@return pointer to next record, or NULL on I/O error or end of list */ +static +const byte* +row_merge_read_rec( +/*===============*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + const byte* b, /*!< in: pointer to record */ + const dict_index_t* index, /*!< in: index of the record */ + int fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t** mrec, /*!< out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + ulint* offsets)/*!< out: offsets of mrec */ +{ + ulint extra_size; + ulint data_size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= block[0]); + ut_ad(b < block[1]); + ut_ad(index); + ut_ad(foffs); + ut_ad(mrec); + ut_ad(offsets); + + ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index)); + + extra_size = *b++; + + if (UNIV_UNLIKELY(!extra_size)) { + /* End of list */ + *mrec = NULL; +#ifdef UNIV_DEBUG + if (row_merge_print_read) { + fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n", + (const void*) b, (const void*) block, + fd, (ulong) *foffs); + } +#endif /* UNIV_DEBUG */ + return(NULL); + } + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + if (UNIV_UNLIKELY(b >= block[1])) { + if (!row_merge_read(fd, ++(*foffs), block)) { +err_exit: + /* Signal I/O error. */ + *mrec = b; + return(NULL); + } + + /* Wrap around to the beginning of the buffer. */ + b = block[0]; + } + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *b++; + } + + /* Normalize extra_size. Above, value 0 signals "end of list". */ + extra_size--; + + /* Read the extra bytes. */ + + if (UNIV_UNLIKELY(b + extra_size >= block[1])) { + /* The record spans two blocks. Copy the entire record + to the auxiliary buffer and handle this as a special + case. */ + + avail_size = block[1] - b; + + memcpy(*buf, b, avail_size); + + if (!row_merge_read(fd, ++(*foffs), block)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = block[0]; + + /* Copy the record. */ + memcpy(*buf + avail_size, b, extra_size - avail_size); + b += extra_size - avail_size; + + *mrec = *buf + extra_size; + + rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets); + + data_size = rec_offs_data_size(offsets); + + /* These overflows should be impossible given that + records are much smaller than either buffer, and + the record starts near the beginning of each buffer. */ + ut_a(extra_size + data_size < sizeof *buf); + ut_a(b + data_size < block[1]); + + /* Copy the data bytes. */ + memcpy(*buf + extra_size, b, data_size); + b += data_size; + + goto func_exit; + } + + *mrec = b + extra_size; + + rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets); + + data_size = rec_offs_data_size(offsets); + ut_ad(extra_size + data_size < sizeof *buf); + + b += extra_size + data_size; + + if (UNIV_LIKELY(b < block[1])) { + /* The record fits entirely in the block. + This is the normal case. */ + goto func_exit; + } + + /* The record spans two blocks. Copy it to buf. */ + + b -= extra_size + data_size; + avail_size = block[1] - b; + memcpy(*buf, b, avail_size); + *mrec = *buf + extra_size; +#ifdef UNIV_DEBUG + /* We cannot invoke rec_offs_make_valid() here, because there + are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size. + Similarly, rec_offs_validate() would fail, because it invokes + rec_get_status(). 
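+	Instead, the two debug slots that rec_offs_make_valid() would set
+	are filled in by hand below: offsets[2] with the record and
+	offsets[3] with the index.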
*/ + offsets[2] = (ulint) *mrec; + offsets[3] = (ulint) index; +#endif /* UNIV_DEBUG */ + + if (!row_merge_read(fd, ++(*foffs), block)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = block[0]; + + /* Copy the rest of the record. */ + memcpy(*buf + avail_size, b, extra_size + data_size - avail_size); + b += extra_size + data_size - avail_size; + +func_exit: +#ifdef UNIV_DEBUG + if (row_merge_print_read) { + fprintf(stderr, "row_merge_read %p,%p,%d,%lu ", + (const void*) b, (const void*) block, + fd, (ulong) *foffs); + rec_print_comp(stderr, *mrec, offsets); + putc('\n', stderr); + } +#endif /* UNIV_DEBUG */ + + return(b); +} + +/********************************************************************//** +Write a merge record. */ +static +void +row_merge_write_rec_low( +/*====================*/ + byte* b, /*!< out: buffer */ + ulint e, /*!< in: encoded extra_size */ +#ifdef UNIV_DEBUG + ulint size, /*!< in: total size to write */ + int fd, /*!< in: file descriptor */ + ulint foffs, /*!< in: file offset */ +#endif /* UNIV_DEBUG */ + const mrec_t* mrec, /*!< in: record to write */ + const ulint* offsets)/*!< in: offsets of mrec */ +#ifndef UNIV_DEBUG +# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \ + row_merge_write_rec_low(b, e, mrec, offsets) +#endif /* !UNIV_DEBUG */ +{ +#ifdef UNIV_DEBUG + const byte* const end = b + size; + ut_ad(e == rec_offs_extra_size(offsets) + 1); + + if (row_merge_print_write) { + fprintf(stderr, "row_merge_write %p,%d,%lu ", + (void*) b, fd, (ulong) foffs); + rec_print_comp(stderr, mrec, offsets); + putc('\n', stderr); + } +#endif /* UNIV_DEBUG */ + + if (e < 0x80) { + *b++ = (byte) e; + } else { + *b++ = (byte) (0x80 | (e >> 8)); + *b++ = (byte) e; + } + + memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets)); + ut_ad(b + rec_offs_size(offsets) == end); +} + +/********************************************************************//** +Write a merge record. +@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_rec( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + byte* b, /*!< in: pointer to end of block */ + int fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t* mrec, /*!< in: record to write */ + const ulint* offsets)/*!< in: offsets of mrec */ +{ + ulint extra_size; + ulint size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= block[0]); + ut_ad(b < block[1]); + ut_ad(mrec); + ut_ad(foffs); + ut_ad(mrec < block[0] || mrec > block[1]); + ut_ad(mrec < buf[0] || mrec > buf[1]); + + /* Normalize extra_size. Value 0 signals "end of list". */ + extra_size = rec_offs_extra_size(offsets) + 1; + + size = extra_size + (extra_size >= 0x80) + + rec_offs_data_size(offsets); + + if (UNIV_UNLIKELY(b + size >= block[1])) { + /* The record spans two blocks. + Copy it to the temporary buffer first. */ + avail_size = block[1] - b; + + row_merge_write_rec_low(buf[0], + extra_size, size, fd, *foffs, + mrec, offsets); + + /* Copy the head of the temporary buffer, write + the completed block, and copy the tail of the + record to the head of the new block. */ + memcpy(b, buf[0], avail_size); + + if (!row_merge_write(fd, (*foffs)++, block)) { + return(NULL); + } + + UNIV_MEM_INVALID(block[0], sizeof block[0]); + + /* Copy the rest. 
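+		of the record: its head was already flushed together with the
+		previous block, so the remaining size - avail_size bytes are
+		copied to the start of the now-empty block buffer.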
*/
+		b = block[0];
+		memcpy(b, buf[0] + avail_size, size - avail_size);
+		b += size - avail_size;
+	} else {
+		row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
+					mrec, offsets);
+		b += size;
+	}
+
+	return(b);
+}
+
+/********************************************************************//**
+Write an end-of-list marker.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	byte*			b,	/*!< in: pointer to end of block */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs)	/*!< in/out: file offset */
+{
+	ut_ad(block);
+	ut_ad(b >= block[0]);
+	ut_ad(b < block[1]);
+	ut_ad(foffs);
+#ifdef UNIV_DEBUG
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
+			(void*) b, (void*) block, fd, (ulong) *foffs);
+	}
+#endif /* UNIV_DEBUG */
+
+	*b++ = 0;
+	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
+	UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
+#ifdef UNIV_DEBUG_VALGRIND
+	/* The rest of the block is uninitialized. Initialize it
+	to avoid bogus warnings. */
+	memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	if (!row_merge_write(fd, (*foffs)++, block)) {
+		return(NULL);
+	}
+
+	UNIV_MEM_INVALID(block[0], sizeof block[0]);
+	return(block[0]);
+}
+
+/*************************************************************//**
+Compare two merge records.
+@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
+static
+int
+row_merge_cmp(
+/*==========*/
+	const mrec_t*		mrec1,		/*!< in: first merge
+						record to be compared */
+	const mrec_t*		mrec2,		/*!< in: second merge
+						record to be compared */
+	const ulint*		offsets1,	/*!< in: first record offsets */
+	const ulint*		offsets2,	/*!< in: second record offsets */
+	const dict_index_t*	index)		/*!< in: index */
+{
+	int	cmp;
+
+	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_cmp) {
+		fputs("row_merge_cmp1 ", stderr);
+		rec_print_comp(stderr, mrec1, offsets1);
+		fputs("\nrow_merge_cmp2 ", stderr);
+		rec_print_comp(stderr, mrec2, offsets2);
+		fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
+	}
+#endif /* UNIV_DEBUG */
+
+	return(cmp);
+}
+
+/********************************************************************//**
+Reads the clustered index of the table and creates temporary files
+containing the index entries for the indexes to be built.
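+This is the first stage of the index build: one pass over the clustered index
+produces, for each index to be created, a file of sorted one-block runs; the
+runs are then merged by row_merge_sort() and the result is loaded by
+row_merge_insert_index_tuples().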
+@return DB_SUCCESS or error */ +static +ulint +row_merge_read_clustered_index( +/*===========================*/ + trx_t* trx, /*!< in: transaction */ + TABLE* table, /*!< in/out: MySQL table object, + for reporting erroneous records */ + const dict_table_t* old_table,/*!< in: table where rows are + read from */ + const dict_table_t* new_table,/*!< in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + dict_index_t** index, /*!< in: indexes to be created */ + merge_file_t* files, /*!< in: temporary files */ + ulint n_index,/*!< in: number of indexes to create */ + row_merge_block_t* block) /*!< in/out: file buffer */ +{ + dict_index_t* clust_index; /* Clustered index */ + mem_heap_t* row_heap; /* Heap memory to create + clustered index records */ + row_merge_buf_t** merge_buf; /* Temporary list for records*/ + btr_pcur_t pcur; /* Persistent cursor on the + clustered index */ + mtr_t mtr; /* Mini transaction */ + ulint err = DB_SUCCESS;/* Return code */ + ulint i; + ulint n_nonnull = 0; /* number of columns + changed to NOT NULL */ + ulint* nonnull = NULL; /* NOT NULL columns */ + + trx->op_info = "reading clustered index"; + + ut_ad(trx); + ut_ad(old_table); + ut_ad(new_table); + ut_ad(index); + ut_ad(files); + + /* Create and initialize memory for record buffers */ + + merge_buf = mem_alloc(n_index * sizeof *merge_buf); + + for (i = 0; i < n_index; i++) { + merge_buf[i] = row_merge_buf_create(index[i]); + } + + mtr_start(&mtr); + + /* Find the clustered index and create a persistent cursor + based on that. */ + + clust_index = dict_table_get_first_index(old_table); + + btr_pcur_open_at_index_side( + TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + + if (UNIV_UNLIKELY(old_table != new_table)) { + ulint n_cols = dict_table_get_n_cols(old_table); + + /* A primary key will be created. Identify the + columns that were flagged NOT NULL in the new table, + so that we can quickly check that the records in the + (old) clustered index do not violate the added NOT + NULL constraints. */ + + ut_a(n_cols == dict_table_get_n_cols(new_table)); + + nonnull = mem_alloc(n_cols * sizeof *nonnull); + + for (i = 0; i < n_cols; i++) { + if (dict_table_get_nth_col(old_table, i)->prtype + & DATA_NOT_NULL) { + + continue; + } + + if (dict_table_get_nth_col(new_table, i)->prtype + & DATA_NOT_NULL) { + + nonnull[n_nonnull++] = i; + } + } + + if (!n_nonnull) { + mem_free(nonnull); + nonnull = NULL; + } + } + + row_heap = mem_heap_create(sizeof(mrec_buf_t)); + + /* Scan the clustered index. */ + for (;;) { + const rec_t* rec; + ulint* offsets; + dtuple_t* row = NULL; + row_ext_t* ext; + ibool has_next = TRUE; + + btr_pcur_move_to_next_on_page(&pcur); + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (btr_pcur_is_after_last_on_page(&pcur)) { + btr_pcur_store_position(&pcur, &mtr); + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_SEARCH_LEAF, + &pcur, &mtr); + has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + if (UNIV_LIKELY(has_next)) { + rec = btr_pcur_get_rec(&pcur); + offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &row_heap); + + /* Skip delete marked records. */ + if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + continue; + } + + srv_n_rows_inserted++; + + /* Build a row based on the clustered index. 
*/ + + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, offsets, + new_table, &ext, row_heap); + + if (UNIV_LIKELY_NULL(nonnull)) { + for (i = 0; i < n_nonnull; i++) { + dfield_t* field + = &row->fields[nonnull[i]]; + dtype_t* field_type + = dfield_get_type(field); + + ut_a(!(field_type->prtype + & DATA_NOT_NULL)); + + if (dfield_is_null(field)) { + err = DB_PRIMARY_KEY_IS_NULL; + i = 0; + goto err_exit; + } + + field_type->prtype |= DATA_NOT_NULL; + } + } + } + + /* Build all entries for all the indexes to be created + in a single scan of the clustered index. */ + + for (i = 0; i < n_index; i++) { + row_merge_buf_t* buf = merge_buf[i]; + merge_file_t* file = &files[i]; + const dict_index_t* index = buf->index; + + if (UNIV_LIKELY + (row && row_merge_buf_add(buf, row, ext))) { + continue; + } + + /* The buffer must be sufficiently large + to hold at least one record. */ + ut_ad(buf->n_tuples || !has_next); + + /* We have enough data tuples to form a block. + Sort them and write to disk. */ + + if (buf->n_tuples) { + if (dict_index_is_unique(index)) { + row_merge_dup_t dup; + dup.index = buf->index; + dup.table = table; + dup.n_dup = 0; + + row_merge_buf_sort(buf, &dup); + + if (dup.n_dup) { + err = DB_DUPLICATE_KEY; +err_exit: + trx->error_key_num = i; + goto func_exit; + } + } else { + row_merge_buf_sort(buf, NULL); + } + } + + row_merge_buf_write(buf, file, block); + + if (!row_merge_write(file->fd, file->offset++, + block)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + + UNIV_MEM_INVALID(block[0], sizeof block[0]); + merge_buf[i] = row_merge_buf_empty(buf); + + /* Try writing the record again, now that + the buffer has been written out and emptied. */ + + if (UNIV_UNLIKELY + (row && !row_merge_buf_add(buf, row, ext))) { + /* An empty buffer should have enough + room for at least one record. */ + ut_error; + } + } + + mem_heap_empty(row_heap); + + if (UNIV_UNLIKELY(!has_next)) { + goto func_exit; + } + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(row_heap); + + if (UNIV_LIKELY_NULL(nonnull)) { + mem_free(nonnull); + } + + for (i = 0; i < n_index; i++) { + row_merge_buf_free(merge_buf[i]); + } + + mem_free(merge_buf); + + trx->op_info = ""; + + return(err); +} + +/** Write a record via buffer 2 and read the next record to buffer N. +@param N number of the buffer (0 or 1) +@param AT_END statement to execute at end of input */ +#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \ + do { \ + b2 = row_merge_write_rec(&block[2], &buf[2], b2, \ + of->fd, &of->offset, \ + mrec##N, offsets##N); \ + if (UNIV_UNLIKELY(!b2)) { \ + goto corrupt; \ + } \ + b##N = row_merge_read_rec(&block[N], &buf[N], \ + b##N, index, \ + file->fd, foffs##N, \ + &mrec##N, offsets##N); \ + if (UNIV_UNLIKELY(!b##N)) { \ + if (mrec##N) { \ + goto corrupt; \ + } \ + AT_END; \ + } \ + } while (0) + +/*************************************************************//** +Merge two blocks of linked lists on disk and write a bigger block. 
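+Each input is a sorted run starting at *foffs0 or *foffs1 and terminated by
+an end-of-list marker; records are consumed and copied via the
+ROW_MERGE_WRITE_GET_NEXT() macro above, so the merged output forms one longer
+run in the output file.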
+@return DB_SUCCESS or error code */ +static +ulint +row_merge_blocks( +/*=============*/ + const dict_index_t* index, /*!< in: index being created */ + merge_file_t* file, /*!< in/out: file containing + index entries */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + ulint* foffs0, /*!< in/out: offset of first + source list in the file */ + ulint* foffs1, /*!< in/out: offset of second + source list in the file */ + merge_file_t* of, /*!< in/out: output file */ + TABLE* table) /*!< in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t buf[3]; /*!< buffer for handling split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + const byte* b1; /*!< pointer to block[1] */ + byte* b2; /*!< pointer to block[2] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */ + const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */ + ulint* offsets0;/* offsets of mrec0 */ + ulint* offsets1;/* offsets of mrec1 */ + + heap = row_merge_heap_create(index, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. */ + + if (!row_merge_read(file->fd, *foffs0, &block[0]) + || !row_merge_read(file->fd, *foffs1, &block[1])) { +corrupt: + mem_heap_free(heap); + return(DB_CORRUPTION); + } + + b0 = block[0]; + b1 = block[1]; + b2 = block[2]; + + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd, + foffs0, &mrec0, offsets0); + b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd, + foffs1, &mrec1, offsets1); + if (UNIV_UNLIKELY(!b0 && mrec0) + || UNIV_UNLIKELY(!b1 && mrec1)) { + + goto corrupt; + } + + while (mrec0 && mrec1) { + switch (row_merge_cmp(mrec0, mrec1, + offsets0, offsets1, index)) { + case 0: + if (UNIV_UNLIKELY + (dict_index_is_unique(index))) { + innobase_rec_to_mysql(table, mrec0, + index, offsets0); + mem_heap_free(heap); + return(DB_DUPLICATE_KEY); + } + /* fall through */ + case -1: + ROW_MERGE_WRITE_GET_NEXT(0, goto merged); + break; + case 1: + ROW_MERGE_WRITE_GET_NEXT(1, goto merged); + break; + default: + ut_error; + } + + } + +merged: + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, goto done0); + } + } +done0: + if (mrec1) { + /* append all mrec1 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(1, goto done1); + } + } +done1: + + mem_heap_free(heap); + b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset); + return(b2 ? DB_SUCCESS : DB_CORRUPTION); +} + +/*************************************************************//** +Merge disk files. +@return DB_SUCCESS or error code */ +static +ulint +row_merge( +/*======*/ + const dict_index_t* index, /*!< in: index being created */ + merge_file_t* file, /*!< in/out: file containing + index entries */ + ulint half, /*!< in: half the file */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + int* tmpfd, /*!< in/out: temporary file handle */ + TABLE* table) /*!< in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + ulint foffs0; /*!< first input offset */ + ulint foffs1; /*!< second input offset */ + ulint error; /*!< error code */ + merge_file_t of; /*!< output file */ + + UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]); + ut_ad(half > 0); + + of.fd = *tmpfd; + of.offset = 0; + + /* Merge blocks to the output file. 
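+	For example, with file->offset == 4 one-block runs and half == 2,
+	the loop below merges the runs starting at blocks 0 and 2, then the
+	runs starting at blocks 1 and 3, leaving two two-block runs; on the
+	next call from row_merge_sort(), with blksz == 2 and again half == 2,
+	those are merged into a single sorted run.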
*/ + foffs0 = 0; + foffs1 = half; + + for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) { + error = row_merge_blocks(index, file, block, + &foffs0, &foffs1, &of, table); + + if (error != DB_SUCCESS) { + return(error); + } + } + + /* Copy the last block, if there is one. */ + while (foffs0 < half) { + if (!row_merge_read(file->fd, foffs0++, block) + || !row_merge_write(of.fd, of.offset++, block)) { + return(DB_CORRUPTION); + } + } + while (foffs1 < file->offset) { + if (!row_merge_read(file->fd, foffs1++, block) + || !row_merge_write(of.fd, of.offset++, block)) { + return(DB_CORRUPTION); + } + } + + /* Swap file descriptors for the next pass. */ + *tmpfd = file->fd; + *file = of; + + UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]); + + return(DB_SUCCESS); +} + +/*************************************************************//** +Merge disk files. +@return DB_SUCCESS or error code */ +static +ulint +row_merge_sort( +/*===========*/ + const dict_index_t* index, /*!< in: index being created */ + merge_file_t* file, /*!< in/out: file containing + index entries */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + int* tmpfd, /*!< in/out: temporary file handle */ + TABLE* table) /*!< in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + ulint blksz; /*!< block size */ + + for (blksz = 1; blksz < file->offset; blksz *= 2) { + ulint half; + ulint error; + + ut_ad(ut_is_2pow(blksz)); + half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz); + error = row_merge(index, file, half, block, tmpfd, table); + + if (error != DB_SUCCESS) { + return(error); + } + } + + return(DB_SUCCESS); +} + +/*************************************************************//** +Copy externally stored columns to the data tuple. */ +static +void +row_merge_copy_blobs( +/*=================*/ + const mrec_t* mrec, /*!< in: merge record */ + const ulint* offsets,/*!< in: offsets of mrec */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + dtuple_t* tuple, /*!< in/out: data tuple */ + mem_heap_t* heap) /*!< in/out: memory heap */ +{ + ulint i; + ulint n_fields = dtuple_get_n_fields(tuple); + + for (i = 0; i < n_fields; i++) { + ulint len; + const void* data; + dfield_t* field = dtuple_get_nth_field(tuple, i); + + if (!dfield_is_ext(field)) { + continue; + } + + ut_ad(!dfield_is_null(field)); + + /* The table is locked during index creation. + Therefore, externally stored columns cannot possibly + be freed between the time the BLOB pointers are read + (row_merge_read_clustered_index()) and dereferenced + (below). 
*/ + data = btr_rec_copy_externally_stored_field( + mrec, offsets, zip_size, i, &len, heap); + + dfield_set_data(field, data, len); + } +} + +/********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +static +ulint +row_merge_insert_index_tuples( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: index */ + dict_table_t* table, /*!< in: new table */ + ulint zip_size,/*!< in: compressed page size of + the old table, or 0 if uncompressed */ + int fd, /*!< in: file descriptor */ + row_merge_block_t* block) /*!< in/out: file buffer */ +{ + mrec_buf_t buf; + const byte* b; + que_thr_t* thr; + ins_node_t* node; + mem_heap_t* tuple_heap; + mem_heap_t* graph_heap; + ulint error = DB_SUCCESS; + ulint foffs = 0; + ulint* offsets; + + ut_ad(trx); + ut_ad(index); + ut_ad(table); + + /* We use the insert query graph as the dummy graph + needed in the row module call */ + + trx->op_info = "inserting index entries"; + + graph_heap = mem_heap_create(500); + node = ins_node_create(INS_DIRECT, table, graph_heap); + + thr = pars_complete_graph_for_exec(node, trx, graph_heap); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + tuple_heap = mem_heap_create(1000); + + { + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + } + + b = *block; + + if (!row_merge_read(fd, foffs, block)) { + error = DB_CORRUPTION; + } else { + for (;;) { + const mrec_t* mrec; + dtuple_t* dtuple; + ulint n_ext; + + b = row_merge_read_rec(block, &buf, b, index, + fd, &foffs, &mrec, offsets); + if (UNIV_UNLIKELY(!b)) { + /* End of list, or I/O error */ + if (mrec) { + error = DB_CORRUPTION; + } + break; + } + + dtuple = row_rec_to_index_entry_low( + mrec, index, offsets, &n_ext, tuple_heap); + + if (UNIV_UNLIKELY(n_ext)) { + row_merge_copy_blobs(mrec, offsets, zip_size, + dtuple, tuple_heap); + } + + node->row = dtuple; + node->table = table; + node->trx_id = trx->id; + + ut_ad(dtuple_validate(dtuple)); + + do { + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + error = row_ins_index_entry(index, dtuple, + 0, FALSE, thr); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + + goto next_rec; + } + + thr->lock_state = QUE_THR_LOCK_ROW; + trx->error_state = error; + que_thr_stop_for_mysql(thr); + thr->lock_state = QUE_THR_LOCK_NOLOCK; + } while (row_mysql_handle_errors(&error, trx, + thr, NULL)); + + goto err_exit; +next_rec: + mem_heap_empty(tuple_heap); + } + } + + que_thr_stop_for_mysql_no_error(thr, trx); +err_exit: + que_graph_free(thr->graph); + + trx->op_info = ""; + + mem_heap_free(tuple_heap); + + return(error); +} + +/*********************************************************************//** +Sets an exclusive lock on a table, for the duration of creating indexes. 
+@return error code or DB_SUCCESS */ +UNIV_INTERN +ulint +row_merge_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ +{ + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + sel_node_t* node; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(mode == LOCK_X || mode == LOCK_S); + + heap = mem_heap_create(512); + + trx->op_info = "setting table lock for creating or dropping index"; + + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(que_node_get_parent(thr)); + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + err = lock_table(0, table, mode, thr); + + trx->error_state = err; + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + que_thr_stop_for_mysql_no_error(thr, trx); + } else { + que_thr_stop_for_mysql(thr); + + if (err != DB_QUE_THR_SUSPENDED) { + ibool was_lock_wait; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; + + parent = que_node_get_parent(thr); + run_thr = que_fork_start_command(parent); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + + goto run_again; + } + } + + que_graph_free(thr->graph); + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Drop an index from the InnoDB system tables. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. */ +UNIV_INTERN +void +row_merge_drop_index( +/*=================*/ + dict_index_t* index, /*!< in: index to be removed */ + dict_table_t* table, /*!< in: table */ + trx_t* trx) /*!< in: transaction handle */ +{ + ulint err; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + + static const char str1[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID = :indexid\n" + " AND TABLE_ID = :tableid;\n" + "END;\n"; + + ut_ad(index && table && trx); + + pars_info_add_dulint_literal(info, "indexid", index->id); + pars_info_add_dulint_literal(info, "tableid", table->id); + + trx_start_if_not_started(trx); + trx->op_info = "dropping index"; + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + err = que_eval_sql(info, str1, FALSE, trx); + + ut_a(err == DB_SUCCESS); + + /* Replace this index with another equivalent index for all + foreign key constraints on this table where this index is used */ + + dict_table_replace_index_in_foreign_list(table, index); + dict_index_remove_from_cache(table, index); + + trx->op_info = ""; +} + +/*********************************************************************//** +Drop those indexes which were created before an error occurred when +building an index. 
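+Each of the indexes is removed from the SYS_FIELDS and SYS_INDEXES
+system tables and from the dictionary cache by row_merge_drop_index().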
The data dictionary must have been locked +exclusively by the caller, because the transaction will not be +committed. */ +UNIV_INTERN +void +row_merge_drop_indexes( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table containing the indexes */ + dict_index_t** index, /*!< in: indexes to drop */ + ulint num_created) /*!< in: number of elements in index[] */ +{ + ulint key_num; + + for (key_num = 0; key_num < num_created; key_num++) { + row_merge_drop_index(index[key_num], table, trx); + } +} + +/*********************************************************************//** +Drop all partially created indexes during crash recovery. */ +UNIV_INTERN +void +row_merge_drop_temp_indexes(void) +/*=============================*/ +{ + trx_t* trx; + ulint err; + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + static const char drop_temp_indexes[] = + "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n" + "indexid CHAR;\n" + "DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n" + "WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n" + "BEGIN\n" + "\tOPEN c;\n" + "\tWHILE 1=1 LOOP\n" + "\t\tFETCH c INTO indexid;\n" + "\t\tIF (SQL % NOTFOUND) THEN\n" + "\t\t\tEXIT;\n" + "\t\tEND IF;\n" + "\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n" + "\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n" + "\tEND LOOP;\n" + "\tCLOSE c;\n" + "\tCOMMIT WORK;\n" + "END;\n"; + + trx = trx_allocate_for_background(); + trx->op_info = "dropping partially created indexes"; + row_mysql_lock_data_dictionary(trx); + + /* Incomplete transactions may be holding some locks on the + data dictionary tables. However, they should never have been + able to lock the records corresponding to the partially + created indexes that we are attempting to delete, because the + table was locked when the indexes were being created. We will + drop the partially created indexes before the rollback of + incomplete transactions is initiated. Thus, this should not + interfere with the incomplete transactions. */ + trx->isolation_level = TRX_ISO_READ_UNCOMMITTED; + err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx); + ut_a(err == DB_SUCCESS); + + row_mysql_unlock_data_dictionary(trx); + trx_free_for_background(trx); +} + +/*********************************************************************//** +Create a merge file. */ +static +void +row_merge_file_create( +/*==================*/ + merge_file_t* merge_file) /*!< out: merge file structure */ +{ + merge_file->fd = innobase_mysql_tmpfile(); + merge_file->offset = 0; +} + +/*********************************************************************//** +Destroy a merge file. */ +static +void +row_merge_file_destroy( +/*===================*/ + merge_file_t* merge_file) /*!< out: merge file structure */ +{ + if (merge_file->fd != -1) { + close(merge_file->fd); + merge_file->fd = -1; + } +} + +/*********************************************************************//** +Determine the precise type of a column that is added to a tem +if a column must be constrained NOT NULL. 
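+For example, if index_def describes PRIMARY KEY(a, b), then for the
+columns a and b this returns col->prtype | DATA_NOT_NULL, while every
+other column keeps its col->prtype unchanged. 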
+@return col->prtype, possibly ORed with DATA_NOT_NULL */ +UNIV_INLINE +ulint +row_merge_col_prtype( +/*=================*/ + const dict_col_t* col, /*!< in: column */ + const char* col_name, /*!< in: name of the column */ + const merge_index_def_t*index_def) /*!< in: the index definition + of the primary key */ +{ + ulint prtype = col->prtype; + ulint i; + + ut_ad(index_def->ind_type & DICT_CLUSTERED); + + if (prtype & DATA_NOT_NULL) { + + return(prtype); + } + + /* All columns that are included + in the PRIMARY KEY must be NOT NULL. */ + + for (i = 0; i < index_def->n_fields; i++) { + if (!strcmp(col_name, index_def->fields[i].field_name)) { + return(prtype | DATA_NOT_NULL); + } + } + + return(prtype); +} + +/*********************************************************************//** +Create a temporary table for creating a primary key, using the definition +of an existing table. +@return table, or NULL on error */ +UNIV_INTERN +dict_table_t* +row_merge_create_temporary_table( +/*=============================*/ + const char* table_name, /*!< in: new table name */ + const merge_index_def_t*index_def, /*!< in: the index definition + of the primary key */ + const dict_table_t* table, /*!< in: old table definition */ + trx_t* trx) /*!< in/out: transaction + (sets error_state) */ +{ + ulint i; + dict_table_t* new_table = NULL; + ulint n_cols = dict_table_get_n_user_cols(table); + ulint error; + mem_heap_t* heap = mem_heap_create(1000); + + ut_ad(table_name); + ut_ad(index_def); + ut_ad(table); + ut_ad(mutex_own(&dict_sys->mutex)); + + new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags); + + for (i = 0; i < n_cols; i++) { + const dict_col_t* col; + const char* col_name; + + col = dict_table_get_nth_col(table, i); + col_name = dict_table_get_col_name(table, i); + + dict_mem_table_add_col(new_table, heap, col_name, col->mtype, + row_merge_col_prtype(col, col_name, + index_def), + col->len); + } + + error = row_create_table_for_mysql(new_table, trx); + mem_heap_free(heap); + + if (error != DB_SUCCESS) { + trx->error_state = error; + new_table = NULL; + } + + return(new_table); +} + +/*********************************************************************//** +Rename the temporary indexes in the dictionary to permanent ones. The +data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +ulint +row_merge_rename_indexes( +/*=====================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table) /*!< in/out: table with new indexes */ +{ + ulint err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. 
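+	The UPDATE below strips the first character of each matching
+	name, i.e. the TEMP_INDEX_PREFIX marking the index as not yet
+	committed; the names in the dictionary cache are adjusted the
+	same way afterwards. 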
*/ + + static const char rename_indexes[] = + "PROCEDURE RENAME_INDEXES_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='" + TEMP_INDEX_PREFIX_STR "';\n" + "END;\n"; + + ut_ad(table); + ut_ad(trx); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + trx->op_info = "renaming indexes"; + + pars_info_add_dulint_literal(info, "tableid", table->id); + + err = que_eval_sql(info, rename_indexes, FALSE, trx); + + if (err == DB_SUCCESS) { + dict_index_t* index = dict_table_get_first_index(table); + do { + if (*index->name == TEMP_INDEX_PREFIX) { + index->name++; + } + index = dict_table_get_next_index(index); + } while (index); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Rename the tables in the data dictionary. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. +@return error code or DB_SUCCESS */ +UNIV_INTERN +ulint +row_merge_rename_tables( +/*====================*/ + dict_table_t* old_table, /*!< in/out: old table, renamed to + tmp_name */ + dict_table_t* new_table, /*!< in/out: new table, renamed to + old_table->name */ + const char* tmp_name, /*!< in: new name for old_table */ + trx_t* trx) /*!< in: transaction handle */ +{ + ulint err = DB_ERROR; + pars_info_t* info; + const char* old_name= old_table->name; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(old_table != new_table); + ut_ad(mutex_own(&dict_sys->mutex)); + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + trx->op_info = "renaming tables"; + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data in system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_name", new_table->name); + pars_info_add_str_literal(info, "old_name", old_name); + pars_info_add_str_literal(info, "tmp_name", tmp_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLES () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET NAME = :tmp_name\n" + " WHERE NAME = :old_name;\n" + "UPDATE SYS_TABLES SET NAME = :old_name\n" + " WHERE NAME = :new_name;\n" + "END;\n", FALSE, trx); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + /* The following calls will also rename the .ibd data files if + the tables are stored in a single-table tablespace */ + + if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE) + || !dict_table_rename_in_cache(new_table, old_name, FALSE)) { + + err = DB_ERROR; + goto err_exit; + } + + err = dict_load_foreigns(old_name, TRUE); + + if (err != DB_SUCCESS) { +err_exit: + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Create and execute a query graph for creating an index. 
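+The graph consists of a single index create node; it is run to
+completion and freed before this function returns, so on success the
+index definition has been inserted into SYS_INDEXES when the caller
+proceeds. 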
+@return DB_SUCCESS or error code */ +static +ulint +row_merge_create_index_graph( +/*=========================*/ + trx_t* trx, /*!< in: trx */ + dict_table_t* table, /*!< in: table */ + dict_index_t* index) /*!< in: index */ +{ + ind_node_t* node; /*!< Index creation node */ + mem_heap_t* heap; /*!< Memory heap */ + que_thr_t* thr; /*!< Query thread */ + ulint err; + + ut_ad(trx); + ut_ad(table); + ut_ad(index); + + heap = mem_heap_create(512); + + index->table = table; + node = ind_create_graph_create(index, heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + + que_run_threads(thr); + + err = trx->error_state; + + que_graph_free((que_t*) que_node_get_parent(thr)); + + return(err); +} + +/*********************************************************************//** +Create the index and load in to the dictionary. +@return index, or NULL on error */ +UNIV_INTERN +dict_index_t* +row_merge_create_index( +/*===================*/ + trx_t* trx, /*!< in/out: trx (sets error_state) */ + dict_table_t* table, /*!< in: the index is on this table */ + const merge_index_def_t*index_def) + /*!< in: the index definition */ +{ + dict_index_t* index; + ulint err; + ulint n_fields = index_def->n_fields; + ulint i; + + /* Create the index prototype, using the passed in def, this is not + a persistent operation. We pass 0 as the space id, and determine at + a lower level the space id where to store the table. */ + + index = dict_mem_index_create(table->name, index_def->name, + 0, index_def->ind_type, n_fields); + + ut_a(index); + + for (i = 0; i < n_fields; i++) { + merge_index_field_t* ifield = &index_def->fields[i]; + + dict_mem_index_add_field(index, ifield->field_name, + ifield->prefix_len); + } + + /* Add the index to SYS_INDEXES, using the index prototype. */ + err = row_merge_create_index_graph(trx, table, index); + + if (err == DB_SUCCESS) { + + index = row_merge_dict_table_get_index( + table, index_def); + + ut_a(index); + + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. */ + index->trx_id = (ib_uint64_t) + ut_conv_dulint_to_longlong(trx->id); + } else { + index = NULL; + } + + return(index); +} + +/*********************************************************************//** +Check if a transaction can use an index. */ +UNIV_INTERN +ibool +row_merge_is_index_usable( +/*======================*/ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to check */ +{ + return(!trx->read_view || read_view_sees_trx_id( + trx->read_view, + ut_dulint_create((ulint) (index->trx_id >> 32), + (ulint) index->trx_id & 0xFFFFFFFF))); +} + +/*********************************************************************//** +Drop the old table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ulint +row_merge_drop_table( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table to drop */ +{ + /* There must be no open transactions on the table. */ + ut_a(table->n_mysql_handles_opened == 0); + + return(row_drop_table_for_mysql(table->name, trx, FALSE)); +} + +/*********************************************************************//** +Build indexes on a table by reading a clustered index, +creating a temporary file containing index entries, merge sorting +these index entries and inserting sorted index entries to indexes. 
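+The clustered index is scanned only once, no matter how many indexes
+are being built; each new index then gets its own merge sort and
+insert pass over its temporary file. 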
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ulint +row_merge_build_indexes( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* old_table, /*!< in: table where rows are + read from */ + dict_table_t* new_table, /*!< in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + dict_index_t** indexes, /*!< in: indexes to be created */ + ulint n_indexes, /*!< in: size of indexes[] */ + TABLE* table) /*!< in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + merge_file_t* merge_files; + row_merge_block_t* block; + ulint block_size; + ulint i; + ulint error; + int tmpfd; + + ut_ad(trx); + ut_ad(old_table); + ut_ad(new_table); + ut_ad(indexes); + ut_ad(n_indexes); + + trx_start_if_not_started(trx); + + /* Allocate memory for merge file data structure and initialize + fields */ + + merge_files = mem_alloc(n_indexes * sizeof *merge_files); + block_size = 3 * sizeof *block; + block = os_mem_alloc_large(&block_size); + + for (i = 0; i < n_indexes; i++) { + + row_merge_file_create(&merge_files[i]); + } + + tmpfd = innobase_mysql_tmpfile(); + + /* Reset the MySQL row buffer that is used when reporting + duplicate keys. */ + innobase_rec_reset(table); + + /* Read clustered index of the table and create files for + secondary index entries for merge sort */ + + error = row_merge_read_clustered_index( + trx, table, old_table, new_table, indexes, + merge_files, n_indexes, block); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + /* Now we have files containing index entries ready for + sorting and inserting. */ + + for (i = 0; i < n_indexes; i++) { + error = row_merge_sort(indexes[i], &merge_files[i], + block, &tmpfd, table); + + if (error == DB_SUCCESS) { + error = row_merge_insert_index_tuples( + trx, indexes[i], new_table, + dict_table_zip_size(old_table), + merge_files[i].fd, block); + } + + /* Close the temporary file to free up space. */ + row_merge_file_destroy(&merge_files[i]); + + if (error != DB_SUCCESS) { + trx->error_key_num = i; + goto func_exit; + } + } + +func_exit: + close(tmpfd); + + for (i = 0; i < n_indexes; i++) { + row_merge_file_destroy(&merge_files[i]); + } + + mem_free(merge_files); + os_mem_free_large(block, block_size); + + return(error); +} diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c new file mode 100644 index 00000000000..b345bb59624 --- /dev/null +++ b/storage/innobase/row/row0mysql.c @@ -0,0 +1,4241 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0mysql.c +Interface between Innobase row operations and MySQL. 
+Contains also create table and other data dictionary operations. + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#include "row0ins.h" +#include "row0merge.h" +#include "row0sel.h" +#include "row0upd.h" +#include "row0row.h" +#include "que0que.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "dict0crea.h" +#include "dict0load.h" +#include "dict0boot.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0undo.h" +#include "lock0lock.h" +#include "rem0cmp.h" +#include "log0log.h" +#include "btr0sea.h" +#include "fil0fil.h" +#include "ibuf0ibuf.h" + +/** Provide optional 4.x backwards compatibility for 5.0 and above */ +UNIV_INTERN ibool row_rollback_on_timeout = FALSE; + +/** Chain node of the list of tables to drop in the background. */ +typedef struct row_mysql_drop_struct row_mysql_drop_t; + +/** Chain node of the list of tables to drop in the background. */ +struct row_mysql_drop_struct{ + char* table_name; /*!< table name */ + UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list; + /*!< list chain node */ +}; + +/** @brief List of tables we should drop in background. + +ALTER TABLE in MySQL requires that the table handler can drop the +table in background when there are no queries to it any +more. Protected by kernel_mutex. */ +static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list; +/** Flag: has row_mysql_drop_list been initialized? */ +static ibool row_mysql_drop_list_inited = FALSE; + +/** Magic table names for invoking various monitor threads */ +/* @{ */ +static const char S_innodb_monitor[] = "innodb_monitor"; +static const char S_innodb_lock_monitor[] = "innodb_lock_monitor"; +static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor"; +static const char S_innodb_table_monitor[] = "innodb_table_monitor"; +static const char S_innodb_mem_validate[] = "innodb_mem_validate"; +/* @} */ + +/** Evaluates to true if str1 equals str2_onstack, used for comparing +the magic table names. +@param str1 in: string to compare +@param str1_len in: length of str1, in bytes, including terminating NUL +@param str2_onstack in: char[] array containing a NUL terminated string +@return TRUE if str1 equals str2_onstack */ +#define STR_EQ(str1, str1_len, str2_onstack) \ + ((str1_len) == sizeof(str2_onstack) \ + && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0) + +/*******************************************************************//** +Determine if the given name is a name reserved for MySQL system tables. +@return TRUE if name is a MySQL system table name */ +static +ibool +row_mysql_is_system_table( +/*======================*/ + const char* name) +{ + if (strncmp(name, "mysql/", 6) != 0) { + + return(FALSE); + } + + return(0 == strcmp(name + 6, "host") + || 0 == strcmp(name + 6, "user") + || 0 == strcmp(name + 6, "db")); +} + +/*********************************************************************//** +If a table is not yet in the drop list, adds the table to the list of tables +which the master thread drops in background. We need this on Unix because in +ALTER TABLE MySQL may call drop table even if the table has running queries on +it. Also, if there are running foreign key checks on the table, we drop the +table lazily. 
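+The master thread later walks this list and retries the drop in the
+background until the table is no longer referenced. 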
+@return TRUE if the table was not yet in the drop list, and was added there */ +static +ibool +row_add_table_to_background_drop_list( +/*==================================*/ + const char* name); /*!< in: table name */ + +/*******************************************************************//** +Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */ +static +void +row_mysql_delay_if_needed(void) +/*===========================*/ +{ + if (srv_dml_needed_delay) { + os_thread_sleep(srv_dml_needed_delay); + } +} + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +UNIV_INTERN +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +{ + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; +} + +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +UNIV_INTERN +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +UNIV_INTERN +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen) /*!< in: storage length of len: either 1 + or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +UNIV_INTERN +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len) /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + + memset(dest, '\0', col_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. 
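+	For example, with col_len == 12 the layout is:
+
+		dest[0..3]	BLOB length, little-endian
+		dest[4..11]	the 8-byte pointer slot for the data
+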
*/ + + ut_a(col_len - 8 > 1 || len < 256); + ut_a(col_len - 8 > 2 || len < 256 * 256); + ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + memcpy(dest + col_len - 8, &data, sizeof data); +} + +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +UNIV_INTERN +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len) /*!< in: BLOB reference length + (not BLOB length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.c. +@return up to which byte we used buf in the conversion */ +UNIV_INTERN +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp) /*!< in: nonzero=compact format */ +{ + const byte* ptr = mysql_data; + const dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. */ + + byte* p = buf + col_len; + + for (;;) { + p--; + *p = *mysql_data; + if (p == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *buf ^= 128; + } + + ptr = buf; + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle UCS2 strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + if (mbminlen == 2) { + /* space=0x0020 */ + /* Trim "half-chars", just in case. 
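+				An odd byte count cannot hold a whole
+				number of UCS2 characters, so a stray
+				trailing byte is dropped before the
+				0x0020 padding is trimmed from the end.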
*/ + col_len &= ~1; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + } else { + ut_a(mbminlen == 1); + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store an ASCII string! */ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string ".a " + where "." denotes a 3-byte character represented by + the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string ".abc " + will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.c, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (type == DATA_BLOB && row_format_col) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/**************************************************************//** +Convert a row in the MySQL format to a row in the Innobase format. Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.c. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /*!< in/out: Innobase row where the + field type information is already + copied there! */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + byte* mysql_rec) /*!< in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ +{ + mysql_row_templ_t* templ; + dfield_t* dfield; + ulint i; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + dfield = dtuple_get_nth_field(row, i); + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] + & (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_null(dfield); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, + prebuilt->ins_upd_rec_buff + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + dict_table_is_comp(prebuilt->table)); +next_column: + ; + } +} + +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. 
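+Most errors roll back only the latest SQL statement (to savept, if one
+is passed in); a deadlock or a full lock table rolls back the whole
+transaction.  A typical caller loop, as in row_insert_for_mysql()
+below, looks like:
+
+	run_again:
+		row_ins_step(thr);
+		err = trx->error_state;
+		if (err != DB_SUCCESS) {
+			que_thr_stop_for_mysql(thr);
+			if (row_mysql_handle_errors(&err, trx,
+						    thr, &savept)) {
+				goto run_again;
+			}
+		}
+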
+@return TRUE if it was a lock wait and we should continue running the +query thread */ +UNIV_INTERN +ibool +row_mysql_handle_errors( +/*====================*/ + ulint* new_err,/*!< out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread */ + trx_savept_t* savept) /*!< in: savepoint or NULL */ +{ + ulint err; + +handle_new_error: + err = trx->error_state; + + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + switch (err) { + case DB_LOCK_WAIT_TIMEOUT: + if (row_rollback_on_timeout) { + trx_general_rollback_for_mysql(trx, FALSE, NULL); + break; + } + /* fall through */ + case DB_DUPLICATE_KEY: + case DB_FOREIGN_DUPLICATE_KEY: + case DB_TOO_BIG_RECORD: + case DB_ROW_IS_REFERENCED: + case DB_NO_REFERENCED_ROW: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_TOO_MANY_CONCURRENT_TRXS: + case DB_OUT_OF_FILE_SPACE: + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + /* MySQL will roll back the latest SQL statement */ + break; + case DB_LOCK_WAIT: + srv_suspend_mysql_thread(thr); + + if (trx->error_state != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + goto handle_new_error; + } + + *new_err = err; + + return(TRUE); + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + break; + + case DB_MUST_GET_MORE_FILE_SPACE: + fputs("InnoDB: The database cannot continue" + " operation because of\n" + "InnoDB: lack of space. You must add" + " a new data file to\n" + "InnoDB: my.cnf and restart the database.\n", stderr); + + exit(1); + + case DB_CORRUPTION: + fputs("InnoDB: We detected index corruption" + " in an InnoDB type table.\n" + "InnoDB: You have to dump + drop + reimport" + " the table or, in\n" + "InnoDB: a case of widespread corruption," + " dump all InnoDB\n" + "InnoDB: tables and recreate the" + " whole InnoDB tablespace.\n" + "InnoDB: If the mysqld server crashes" + " after the startup or when\n" + "InnoDB: you dump the tables, look at\n" + "InnoDB: " REFMAN "forcing-recovery.html" + " for help.\n", stderr); + break; + default: + fprintf(stderr, "InnoDB: unknown error code %lu\n", + (ulong) err); + ut_error; + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(FALSE); +} + +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. 
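+The struct itself and the objects created with it, such as the search
+tuple, are allocated from a single memory heap that is freed last in
+row_prebuilt_free().  A handle is therefore used as, e.g.:
+
+	prebuilt = row_create_prebuilt(table);
+	...
+	row_prebuilt_free(prebuilt, FALSE);
+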
+@return own: a prebuilt struct */ +UNIV_INTERN +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table) /*!< in: Innobase table handle */ +{ + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dtuple_t* ref; + ulint ref_len; + + heap = mem_heap_create(sizeof *prebuilt + 128); + + prebuilt = mem_heap_zalloc(heap, sizeof *prebuilt); + + prebuilt->magic_n = ROW_PREBUILT_ALLOCATED; + prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED; + + prebuilt->table = table; + + prebuilt->sql_stat_start = TRUE; + prebuilt->heap = heap; + + prebuilt->pcur = btr_pcur_create_for_mysql(); + prebuilt->clust_pcur = btr_pcur_create_for_mysql(); + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = 99999999; + + prebuilt->search_tuple = dtuple_create( + heap, 2 * dict_table_get_n_cols(table)); + + clust_index = dict_table_get_first_index(table); + + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + prebuilt->autoinc_error = 0; + prebuilt->autoinc_offset = 0; + + /* Default to 1, we will set the actual value later in + ha_innobase::get_auto_increment(). */ + prebuilt->autoinc_increment = 1; + + prebuilt->autoinc_last_value = 0; + + return(prebuilt); +} + +/********************************************************************//** +Free a prebuilt struct for a MySQL table handle. */ +UNIV_INTERN +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */ + ibool dict_locked) /*!< in: TRUE=data dictionary locked */ +{ + ulint i; + + if (UNIV_UNLIKELY + (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED + || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) { + + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. 
Magic n %lu," + " magic n2 %lu, table name ", + (ulong) prebuilt->magic_n, + (ulong) prebuilt->magic_n2); + ut_print_name(stderr, NULL, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + prebuilt->magic_n = ROW_PREBUILT_FREED; + prebuilt->magic_n2 = ROW_PREBUILT_FREED; + + btr_pcur_free_for_mysql(prebuilt->pcur); + btr_pcur_free_for_mysql(prebuilt->clust_pcur); + + if (prebuilt->mysql_template) { + mem_free(prebuilt->mysql_template); + } + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + mem_heap_free(prebuilt->blob_heap); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + if (prebuilt->fetch_cache[i] != NULL) { + + if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4( + (prebuilt->fetch_cache[i]) - 4)) + || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4( + (prebuilt->fetch_cache[i]) + + prebuilt->mysql_row_len))) { + fputs("InnoDB: Error: trying to free" + " a corrupt fetch buffer.\n", stderr); + + mem_analyze_corruption( + prebuilt->fetch_cache[i]); + + ut_error; + } + + mem_free((prebuilt->fetch_cache[i]) - 4); + } + } + + dict_table_decrement_handle_count(prebuilt->table, dict_locked); + + mem_heap_free(prebuilt->heap); +} + +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +UNIV_INTERN +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx) /*!< in: transaction handle */ +{ + if (trx->magic_n != TRX_MAGIC_N) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: trx handle. Magic n %lu\n", + (ulong) trx->magic_n); + + mem_analyze_corruption(trx); + + ut_error; + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/*********************************************************************//** +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. 
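+The insert node and query graph are cached in the prebuilt struct, so
+the build cost is paid only on the first insert through a given table
+handle; later calls simply return the cached row. 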
+@return prebuilt dtuple; the column type information is also set in it */ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + ins_node_t* node; + dtuple_t* row; + dict_table_t* table = prebuilt->table; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node == NULL) { + + /* Not called before for this handle: create an insert node + and query graph to the prebuilt struct */ + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == NULL) { + prebuilt->ins_upd_rec_buff = mem_heap_alloc( + prebuilt->heap, prebuilt->mysql_row_len); + } + + row = dtuple_create(prebuilt->heap, + dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->ins_node->row); +} + +/*********************************************************************//** +Updates the table modification counter and calculates new estimates +for table and index statistics if necessary. */ +UNIV_INLINE +void +row_update_statistics_if_needed( +/*============================*/ + dict_table_t* table) /*!< in: table */ +{ + ulint counter; + + counter = table->stat_modified_counter; + + table->stat_modified_counter = counter + 1; + + /* Calculate new statistics if 1 / 16 of table has been modified + since the last time a statistics batch was run, or if + stat_modified_counter > 2 000 000 000 (to avoid wrap-around). + We calculate statistics at most every 16th round, since we may have + a counter table which is very small and updated very often. */ + + if (counter > 2000000000 + || ((ib_int64_t)counter > 16 + table->stat_n_rows / 16)) { + + dict_update_statistics(table); + } +} + +/*********************************************************************//** +Unlocks AUTO_INC type locks that were possibly reserved by a trx. */ +UNIV_INTERN +void +row_unlock_table_autoinc_for_mysql( +/*===============================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mutex_enter(&kernel_mutex); + + lock_release_autoinc_locks(trx); + + mutex_exit(&kernel_mutex); +} + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ +{ + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + const dict_table_t* table = prebuilt->table; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + /* If we already hold an AUTOINC lock on the table then do nothing. + Note: We peek at the value of the current owner without acquiring + the kernel mutex. 
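+	If the peek shows that we are not the owner, we fall through
+	to lock_table(), which rechecks ownership under the kernel
+	mutex; a stale read therefore costs at most one extra trip
+	through the lock module. 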
*/
+	if (trx == table->autoinc_trx) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx->op_info = "setting auto-inc lock";
+
+	if (node == NULL) {
+		row_get_prebuilt_insert_row(prebuilt);
+		node = prebuilt->ins_node;
+	}
+
+	/* We use the insert query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+
+	/* It may be that the current session has not yet started
+	its transaction, or it has been committed: */
+
+	trx_start_if_not_started(trx);
+
+	err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct in the MySQL
+					table handle */
+	dict_table_t*	table,		/*!< in: table to lock, or NULL
+					if prebuilt->table should be
+					locked as
+					prebuilt->select_lock_type */
+	ulint		mode)		/*!< in: lock mode of table
+					(ignored if table==NULL) */
+{
+	trx_t*		trx		= prebuilt->trx;
+	que_thr_t*	thr;
+	ulint		err;
+	ibool		was_lock_wait;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "setting table lock";
+
+	if (prebuilt->sel_graph == NULL) {
+		/* Build a dummy select query graph */
+		row_prebuild_sel_graph(prebuilt);
+	}
+
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
+
+	/* It may be that the current session has not yet started
+	its transaction, or it has been committed: */
+
+	trx_start_if_not_started(trx);
+
+	if (table) {
+		err = lock_table(0, table, mode, thr);
+	} else {
+		err = lock_table(0, prebuilt->table,
+				 prebuilt->select_lock_type, thr);
+	}
+
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Does an insert for MySQL. 
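+The row is converted from the MySQL format to the InnoDB format, a
+savepoint is taken, and the cached insert graph is run; lock waits are
+resolved by row_mysql_handle_errors() and the insert step is retried.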
+@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_insert_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that" + " newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "inserting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started(trx); + + if (node == NULL) { + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + } + + row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec); + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + } + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + /* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW; + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + prebuilt->table->stat_n_rows++; + + srv_n_rows_inserted++; + + if (prebuilt->table->stat_n_rows == 0) { + /* Avoid wrap-over */ + prebuilt->table->stat_n_rows--; + } + + row_update_statistics_if_needed(prebuilt->table); + trx->op_info = ""; + + return((int) err); +} + +/*********************************************************************//** +Builds a dummy query graph used in selects. 
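+The graph carries no real SELECT statement; it only supplies the query
+thread that the lock module expects, as in row_lock_table_for_mysql()
+above. 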
*/
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
+					handle */
+{
+	sel_node_t*	node;
+
+	ut_ad(prebuilt && prebuilt->trx);
+
+	if (prebuilt->sel_graph == NULL) {
+
+		node = sel_node_create(prebuilt->heap);
+
+		prebuilt->sel_graph = que_node_get_parent(
+			pars_complete_graph_for_exec(node,
+						     prebuilt->trx,
+						     prebuilt->heap));
+
+		prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+	}
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+	dict_table_t*	table,	/*!< in: table to update */
+	mem_heap_t*	heap)	/*!< in: mem heap from which allocated */
+{
+	upd_node_t*	node;
+
+	node = upd_node_create(heap);
+
+	node->in_mysql_interface = TRUE;
+	node->is_delete = FALSE;
+	node->searched_update = FALSE;
+	node->select = NULL;
+	node->pcur = btr_pcur_create_for_mysql();
+	node->table = table;
+
+	node->update = upd_create(dict_table_get_n_cols(table), heap);
+
+	node->update_n_fields = dict_table_get_n_cols(table);
+
+	UT_LIST_INIT(node->columns);
+	node->has_clust_rec_x_lock = TRUE;
+	node->cmpl_info = 0;
+
+	node->table_sym = NULL;
+	node->col_assign_list = NULL;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
+					handle */
+{
+	dict_table_t*	table	= prebuilt->table;
+	upd_node_t*	node;
+
+	ut_ad(prebuilt && table && prebuilt->trx);
+
+	if (prebuilt->upd_node == NULL) {
+
+		/* Not called before for this handle: create an update node
+		and query graph in the prebuilt struct */
+
+		node = row_create_update_node_for_mysql(table, prebuilt->heap);
+
+		prebuilt->upd_node = node;
+
+		prebuilt->upd_graph = que_node_get_parent(
+			pars_complete_graph_for_exec(node,
+						     prebuilt->trx,
+						     prebuilt->heap));
+		prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+	}
+
+	return(prebuilt->upd_node->update);
+}
+
+/*********************************************************************//**
+Does an update or delete of a row for MySQL. 
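+The row to update is located through the cursor position saved in
+prebuilt->pcur (or prebuilt->clust_pcur) by the preceding search;
+mysql_rec itself is not used here. 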
+@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_update_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + ulint err; + que_thr_t* thr; + ibool was_lock_wait; + dict_index_t* clust_index; + /* ulint ref_len; */ + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + UT_NOT_USED(mysql_rec); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw" + " is replaced\n" + "InnoDB: with raw, and innodb_force_... 
is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "updating or deleting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started(trx); + + node = prebuilt->upd_node; + + clust_index = dict_table_get_first_index(table); + + if (prebuilt->pcur->btr_cur.index == clust_index) { + btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur); + } else { + btr_pcur_copy_stored_position(node->pcur, + prebuilt->clust_pcur); + } + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + /* MySQL seems to call rnd_pos before updating each row it + has cached: we can get the correct cursor position from + prebuilt->pcur; NOTE that we cannot build the row reference + from mysql_rec if the clustered index was automatically + generated for the table: MySQL does not know anything about + the row id used as the clustered index key */ + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + + return((int) err); + } + + thr->lock_state= QUE_THR_LOCK_ROW; + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (node->is_delete) { + if (prebuilt->table->stat_n_rows > 0) { + prebuilt->table->stat_n_rows--; + } + + srv_n_rows_deleted++; + } else { + srv_n_rows_updated++; + } + + row_update_statistics_if_needed(prebuilt->table); + + trx->op_info = ""; + + return((int) err); +} + +/*********************************************************************//** +This can only be used when srv_locks_unsafe_for_binlog is TRUE or +this session is using a READ COMMITTED isolation level. Before +calling this function we must use trx_reset_new_rec_lock_info() and +trx_register_new_rec_lock() to store the information which new record locks +really were set. This function removes a newly set lock under prebuilt->pcur, +and also under prebuilt->clust_pcur. Currently, this is only used and tested +in the case of an UPDATE or a DELETE statement, where the row lock is of the +LOCK_X type. +Thus, this implements a 'mini-rollback' that releases the latest record +locks we set. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_unlock_for_mysql( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs)/*!< TRUE if called so that we have + the latches on the records under pcur + and clust_pcur, and we do not need to + reposition the cursors. 
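+					In that case the cursors must
+					already be positioned on the
+					records whose locks are to be
+					released. 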
*/ +{ + btr_pcur_t* pcur = prebuilt->pcur; + btr_pcur_t* clust_pcur = prebuilt->clust_pcur; + trx_t* trx = prebuilt->trx; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (UNIV_UNLIKELY + (!srv_locks_unsafe_for_binlog + && trx->isolation_level != TRX_ISO_READ_COMMITTED)) { + + fprintf(stderr, + "InnoDB: Error: calling row_unlock_for_mysql though\n" + "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n" + "InnoDB: this session is not using" + " READ COMMITTED isolation level.\n"); + + return(DB_SUCCESS); + } + + trx->op_info = "unlock_row"; + + if (prebuilt->new_rec_locks >= 1) { + + const rec_t* rec; + dict_index_t* index; + trx_id_t rec_trx_id; + mtr_t mtr; + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr); + } + + rec = btr_pcur_get_rec(pcur); + index = btr_pcur_get_btr_cur(pcur)->index; + + if (prebuilt->new_rec_locks >= 2) { + /* Restore the cursor position and find the record + in the clustered index. */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, + clust_pcur, &mtr); + } + + rec = btr_pcur_get_rec(clust_pcur); + index = btr_pcur_get_btr_cur(clust_pcur)->index; + } + + if (UNIV_UNLIKELY(!dict_index_is_clust(index))) { + /* This is not a clustered index record. We + do not know how to unlock the record. */ + goto no_unlock; + } + + /* If the record has been modified by this + transaction, do not unlock it. */ + + if (index->trx_id_offset) { + rec_trx_id = trx_read_trx_id(rec + + index->trx_id_offset); + } else { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + if (ut_dulint_cmp(rec_trx_id, trx->id) != 0) { + /* We did not update the record: unlock it */ + + rec = btr_pcur_get_rec(pcur); + index = btr_pcur_get_btr_cur(pcur)->index; + + lock_rec_unlock(trx, btr_pcur_get_block(pcur), + rec, prebuilt->select_lock_type); + + if (prebuilt->new_rec_locks >= 2) { + rec = btr_pcur_get_rec(clust_pcur); + index = btr_pcur_get_btr_cur(clust_pcur)->index; + + lock_rec_unlock(trx, + btr_pcur_get_block(clust_pcur), + rec, + prebuilt->select_lock_type); + } + } +no_unlock: + mtr_commit(&mtr); + } + + trx->op_info = ""; + + return(DB_SUCCESS); +} + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. +@return error code or DB_SUCCESS */ +UNIV_INTERN +ulint +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ +{ + ulint err; + trx_t* trx; + + trx = thr_get_trx(thr); +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + /* Note that the cascade node is a subnode of another InnoDB + query graph node. We do a normal lock wait in this node, but + all errors are handled by the parent node. 
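+	A lock wait (DB_LOCK_WAIT) is serviced right below by suspending
+	the query thread; a lock wait timeout or a deadlock rollback
+	surfaces afterwards through trx->error_state and is returned to
+	the parent.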
*/
+
+	if (err == DB_LOCK_WAIT) {
+		/* Handle lock wait here */
+
+		que_thr_stop_for_mysql(thr);
+
+		srv_suspend_mysql_thread(thr);
+
+		/* Note that a lock wait may also end in a lock wait timeout,
+		or this transaction is picked as a victim in selective
+		deadlock resolution */
+
+		if (trx->error_state != DB_SUCCESS) {
+
+			return(trx->error_state);
+		}
+
+		/* Retry operation after a normal lock wait */
+
+		goto run_again;
+	}
+
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	if (node->is_delete) {
+		if (table->stat_n_rows > 0) {
+			table->stat_n_rows--;
+		}
+
+		srv_n_rows_deleted++;
+	} else {
+		srv_n_rows_updated++;
+	}
+
+	row_update_statistics_if_needed(table);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on the row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	const dict_index_t*	clust_index;
+
+	clust_index = dict_table_get_first_index(table);
+
+	return(dict_index_get_nth_col(clust_index, 0)->mtype == DATA_SYS);
+}
+
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index. We have
+to take into account whether we generated a default clustered index for
+the table.
+@return the key number used inside MySQL */
+UNIV_INTERN
+ulint
+row_get_mysql_key_number_for_index(
+/*===============================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	const dict_index_t*	ind;
+	ulint			i;
+
+	ut_a(index);
+
+	i = 0;
+	ind = dict_table_get_first_index(index->table);
+
+	while (index != ind) {
+		ind = dict_table_get_next_index(ind);
+		i++;
+	}
+
+	if (row_table_got_default_clust_index(index->table)) {
+		ut_a(i > 0);
+		i--;
+	}
+
+	return(i);
+}
+
+/*********************************************************************//**
+Locks the data dictionary in shared mode, preventing modifications, for
+performing a foreign key check, rollback, or another operation invisible
+to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	file,	/*!< in: file name */
+	ulint		line)	/*!< in: line number */
+{
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	rw_lock_s_lock_func(&dict_operation_lock, 0, file, line);
+
+	trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+	rw_lock_s_unlock(&dict_operation_lock);
+
+	trx->dict_operation_lock_mode = 0;
+}
+
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation.
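+Callers normally invoke this through the row_mysql_lock_data_dictionary()
+macro, which supplies the caller's file name and line number. A typical
+bracket looks like this (a sketch; the dictionary operation itself is
+elided):
+
+	row_mysql_lock_data_dictionary(trx);
+	... create or modify data dictionary objects ...
+	row_mysql_unlock_data_dictionary(trx);
+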
*/ +UNIV_INTERN +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line) /*!< in: line number */ +{ + ut_a(trx->dict_operation_lock_mode == 0 + || trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks or lock waits can occur then in these operations */ + + rw_lock_x_lock_func(&dict_operation_lock, 0, file, line); + trx->dict_operation_lock_mode = RW_X_LATCH; + + mutex_enter(&(dict_sys->mutex)); +} + +/*********************************************************************//** +Unlocks the data dictionary exclusive lock. */ +UNIV_INTERN +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_exit(&(dict_sys->mutex)); + rw_lock_x_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +/*********************************************************************//** +Creates a table for MySQL. If the name of the table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also start the printing of monitor +output by the master thread. If the table name ends in "innodb_mem_validate", +InnoDB will try to invoke mem_validate(). +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed) */ + trx_t* trx) /*!< in: transaction handle */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + const char* table_name; + ulint table_name_len; + ulint err; + ulint i; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); +err_exit: + dict_mem_table_free(table); + trx_commit_for_mysql(trx); + + return(DB_ERROR); + } + + trx->op_info = "creating table"; + + if (row_mysql_is_system_table(table->name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL system" + " table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be" + " of the MyISAM type!\n", + table->name); + goto err_exit; + } + + /* Check that no reserved column names are used. */ + for (i = 0; i < dict_table_get_n_user_cols(table); i++) { + if (dict_col_name_is_reserved( + dict_table_get_col_name(table, i))) { + + goto err_exit; + } + } + + trx_start_if_not_started(trx); + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. 
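+	For example, creating either test/innodb_monitor or
+	somedb/innodb_monitor starts the monitor prints, because only
+	the part of the name after the '/' is compared below. */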
*/ + table_name = strchr(table->name, '/'); + ut_a(table_name); + table_name++; + table_name_len = strlen(table_name) + 1; + + if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) { + + /* Table equals "innodb_monitor": + start monitor prints */ + + srv_print_innodb_monitor = TRUE; + + /* The lock timeout monitor thread also takes care + of InnoDB monitor prints */ + + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_lock_monitor)) { + + srv_print_innodb_monitor = TRUE; + srv_print_innodb_lock_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_mem_validate)) { + /* We define here a debugging feature intended for + developers */ + + fputs("Validating InnoDB memory:\n" + "to use this feature you must compile InnoDB with\n" + "UNIV_MEM_DEBUG defined in univ.i and" + " the server must be\n" + "quiet because allocation from a mem heap" + " is not protected\n" + "by any semaphore.\n", stderr); +#ifdef UNIV_MEM_DEBUG + ut_a(mem_validate()); + fputs("Memory validated\n", stderr); +#else /* UNIV_MEM_DEBUG */ + fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n", + stderr); +#endif /* UNIV_MEM_DEBUG */ + } + + heap = mem_heap_create(512); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + node = tab_create_graph_create(table, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + err = trx->error_state; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + } + + switch (err) { + case DB_OUT_OF_FILE_SPACE: + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: cannot create table ", + stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" because tablespace full\n", stderr); + + if (dict_table_get_low(table->name)) { + + row_drop_table_for_mysql(table->name, trx, FALSE); + trx_commit_for_mysql(trx); + } + break; + + case DB_DUPLICATE_KEY: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" already exists in InnoDB internal\n" + "InnoDB: data dictionary. Have you deleted" + " the .frm file\n" + "InnoDB: and not used DROP TABLE?" 
+ " Have you used DROP DATABASE\n" + "InnoDB: for InnoDB tables in" + " MySQL version <= 3.23.43?\n" + "InnoDB: See the Restrictions section" + " of the InnoDB manual.\n" + "InnoDB: You can drop the orphaned table" + " inside InnoDB by\n" + "InnoDB: creating an InnoDB table with" + " the same name in another\n" + "InnoDB: database and copying the .frm file" + " to the current database.\n" + "InnoDB: Then MySQL thinks the table exists," + " and DROP TABLE will\n" + "InnoDB: succeed.\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n", + stderr); + + /* We may also get err == DB_ERROR if the .ibd file for the + table already exists */ + + break; + } + + que_graph_free((que_t*) que_node_get_parent(thr)); + + trx->op_info = ""; + + return((int) err); +} + +/*********************************************************************//** +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. +@return error number or DB_SUCCESS */ +UNIV_INTERN +int +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths) /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + ulint i; + ulint len; + char* table_name; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "creating index"; + + /* Copy the table name because we may want to drop the + table later, after the index object is freed (inside + que_run_threads()) and thus index->table_name is not available. */ + table_name = mem_strdup(index->table_name); + + trx_start_if_not_started(trx); + + /* Check that the same column does not appear twice in the index. + Starting from 4.0.14, InnoDB should be able to cope with that, but + safer not to allow them. */ + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + ulint j; + + for (j = 0; j < i; j++) { + if (0 == ut_strcmp( + dict_index_get_nth_field(index, j)->name, + dict_index_get_nth_field(index, i)->name)) { + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: column ", stderr); + ut_print_name(stderr, trx, FALSE, + dict_index_get_nth_field( + index, i)->name); + fputs(" appears twice in ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: This is not allowed" + " in InnoDB.\n", stderr); + + err = DB_COL_APPEARS_TWICE_IN_INDEX; + + goto error_handling; + } + } + + /* Check also that prefix_len and actual length + < DICT_MAX_INDEX_COL_LEN */ + + len = dict_index_get_nth_field(index, i)->prefix_len; + + if (field_lengths) { + len = ut_max(len, field_lengths[i]); + } + + if (len >= DICT_MAX_INDEX_COL_LEN) { + err = DB_TOO_BIG_RECORD; + + goto error_handling; + } + } + + heap = mem_heap_create(512); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + /* Note that the space id where we store the index is inherited from + the table in dict_build_index_def_step() in dict0crea.c. 
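+	Thus the B-tree of an index of a table that resides in a
+	single-table tablespace is created inside that same .ibd file.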
*/
+
+	node = ind_create_graph_create(index, heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+error_handling:
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+		row_drop_table_for_mysql(table_name, trx, FALSE);
+
+		trx_commit_for_mysql(trx);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	trx->op_info = "";
+
+	mem_free(table_name);
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied by indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. It is also checked that the
+foreign key constraints which reference this table are ok.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+					FOREIGN KEY (a, b) REFERENCES table2(c, d),
+					table2 can also be written with the
+					database name before it: test.table2 */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+{
+	ulint	err;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_a(sql_string);
+
+	trx->op_info = "adding foreign keys";
+
+	trx_start_if_not_started(trx);
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	err = dict_create_foreign_constraints(trx, sql_string, name,
+					      reject_fks);
+	if (err == DB_SUCCESS) {
+		/* Check that also referencing constraints are ok */
+		err = dict_load_foreigns(name, TRUE);
+	}
+
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+		row_drop_table_for_mysql(name, trx, FALSE);
+
+		trx_commit_for_mysql(trx);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. In ALTER TABLE on Unix,
+MySQL relies on the fact that the table handler does not remove the table
+before all handles to it have been closed. Furthermore, MySQL's call to
+drop the table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.c.
+@return error code or DB_SUCCESS */ +static +int +row_drop_table_for_mysql_in_background( +/*===================================*/ + const char* name) /*!< in: table name */ +{ + ulint error; + trx_t* trx; + + trx = trx_allocate_for_background(); + + /* If the original transaction was dropping a table referenced by + foreign keys, we must set the following to be able to drop the + table: */ + + trx->check_foreigns = FALSE; + + /* fputs("InnoDB: Error: Dropping table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(" in background drop list\n", stderr); */ + + /* Try to drop the table in InnoDB */ + + error = row_drop_table_for_mysql(name, trx, FALSE); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + trx_commit_for_mysql(trx); + + trx_free_for_background(trx); + + return((int) error); +} + +/*********************************************************************//** +The master thread in srv0srv.c calls this regularly to drop tables which +we must drop in background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. +@return how many tables dropped + remaining tables in list */ +UNIV_INTERN +ulint +row_drop_tables_for_mysql_in_background(void) +/*=========================================*/ +{ + row_mysql_drop_t* drop; + dict_table_t* table; + ulint n_tables; + ulint n_tables_dropped = 0; +loop: + mutex_enter(&kernel_mutex); + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + + n_tables = UT_LIST_GET_LEN(row_mysql_drop_list); + + mutex_exit(&kernel_mutex); + + if (drop == NULL) { + /* All tables dropped */ + + return(n_tables + n_tables_dropped); + } + + mutex_enter(&(dict_sys->mutex)); + table = dict_table_get_low(drop->table_name); + mutex_exit(&(dict_sys->mutex)); + + if (table == NULL) { + /* If for some reason the table has already been dropped + through some other mechanism, do not try to drop it */ + + goto already_dropped; + } + + if (DB_SUCCESS != row_drop_table_for_mysql_in_background( + drop->table_name)) { + /* If the DROP fails for some table, we return, and let the + main thread retry later */ + + return(n_tables + n_tables_dropped); + } + + n_tables_dropped++; + +already_dropped: + mutex_enter(&kernel_mutex); + + UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Dropped table ", stderr); + ut_print_name(stderr, NULL, TRUE, drop->table_name); + fputs(" in background drop queue.\n", stderr); + + mem_free(drop->table_name); + + mem_free(drop); + + mutex_exit(&kernel_mutex); + + goto loop; +} + +/*********************************************************************//** +Get the background drop list length. NOTE: the caller must own the kernel +mutex! 
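+A sketch of the expected calling pattern (n is an illustrative local):
+
+	mutex_enter(&kernel_mutex);
+	n = row_get_background_drop_list_len_low();
+	mutex_exit(&kernel_mutex);
+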
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (!row_mysql_drop_list_inited) {
+
+		UT_LIST_INIT(row_mysql_drop_list);
+		row_mysql_drop_list_inited = TRUE;
+	}
+
+	return(UT_LIST_GET_LEN(row_mysql_drop_list));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+	const char*	name)	/*!< in: table name */
+{
+	row_mysql_drop_t*	drop;
+
+	mutex_enter(&kernel_mutex);
+
+	if (!row_mysql_drop_list_inited) {
+
+		UT_LIST_INIT(row_mysql_drop_list);
+		row_mysql_drop_list_inited = TRUE;
+	}
+
+	/* Check whether the table is already in the drop list */
+	drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+	while (drop != NULL) {
+		if (strcmp(drop->table_name, name) == 0) {
+			/* Already in the list */
+
+			mutex_exit(&kernel_mutex);
+
+			return(FALSE);
+		}
+
+		drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop);
+	}
+
+	drop = mem_alloc(sizeof(row_mysql_drop_t));
+
+	drop->table_name = mem_strdup(name);
+
+	UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+	/*	fputs("InnoDB: Adding table ", stderr);
+	ut_print_name(stderr, trx, TRUE, drop->table_name);
+	fputs(" to background drop list\n", stderr); */
+
+	mutex_exit(&kernel_mutex);
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file.
+Discarding means that this function deletes the .ibd file and assigns a new
+table id for the table. Also the flag table->ibd_file_missing is set to TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	dict_foreign_t*	foreign;
+	dulint		new_id;
+	dict_table_t*	table;
+	ibool		success;
+	ulint		err;
+	pars_info_t*	info = NULL;
+
+	/* How do we prevent crashes caused by ongoing operations on
+	the table? Old operations could try to access non-existent
+	pages.
+
+	1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+	MySQL table lock on the table before we can do DISCARD
+	TABLESPACE. Then there are no running queries on the table.
+
+	2) Purge and rollback: we assign a new table id for the
+	table. Since purge and rollback look for the table based on
+	the table id, they see the table as 'dropped' and discard
+	their operations.
+
+	3) Insert buffer: we remove all entries for the tablespace in
+	the insert buffer tree; as long as the tablespace mem object
+	does not exist, ongoing insert buffer page merges are
+	discarded in buf0rea.c. If we recreate the tablespace mem
+	object with IMPORT TABLESPACE later, then the tablespace will
+	have the same id, but the tablespace_version field in the mem
+	object is different, and ongoing old insert buffer page merges
+	get discarded.
+
+	4) Linear readahead and random readahead: we use the same
+	method as in 3) to discard ongoing operations.
+
+	5) FOREIGN KEY operations: if
+	table->n_foreign_key_checks_running > 0, we do not allow the
+	discard. We also reserve the data dictionary latch. */
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "discarding tablespace";
+	trx_start_if_not_started(trx);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	table = dict_table_get_low(name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+
+		goto funct_exit;
+	}
+
+	if (table->space == 0) {
+		ut_print_timestamp(stderr);
+		fputs(" InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs("\n"
+		      "InnoDB: is in the system tablespace 0"
+		      " which cannot be discarded\n", stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	if (table->n_foreign_key_checks_running > 0) {
+
+		ut_print_timestamp(stderr);
+		fputs(" InnoDB: You are trying to DISCARD table ", stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs("\n"
+		      "InnoDB: though there is a foreign key check"
+		      " running on it.\n"
+		      "InnoDB: Cannot discard the table.\n",
+		      stderr);
+
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign && foreign->foreign_table == table) {
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns) {
+
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow discarding a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		err = DB_CANNOT_DROP_CONSTRAINT;
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+
+		fputs(" Cannot DISCARD table ", ef);
+		ut_print_name(ef, trx, TRUE, name);
+		fputs("\n"
+		      "because it is referenced by ", ef);
+		ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		goto funct_exit;
+	}
+
+	new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+	/* Remove all locks except the table-level S and X locks.
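+	The FALSE argument below makes lock_remove_all_on_table() keep
+	those table-level locks; contrast row_drop_table_for_mysql(),
+	which passes TRUE to remove every lock on the table.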
*/ + lock_remove_all_on_table(table, FALSE); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "table_name", name); + pars_info_add_dulint_literal(info, "new_id", new_id); + + err = que_eval_sql(info, + "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "BEGIN\n" + "SELECT ID INTO old_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + dict_table_change_id_in_cache(table, new_id); + + success = fil_discard_tablespace(table->space); + + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + + err = DB_ERROR; + } else { + /* Set the flag which tells that now it is legal to + IMPORT a tablespace for this table */ + table->tablespace_discarded = TRUE; + table->ibd_file_missing = TRUE; + } + } + +funct_exit: + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return((int) err); +} + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_import_tablespace_for_mysql( +/*============================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ +{ + dict_table_t* table; + ibool success; + ib_uint64_t current_lsn; + ulint err = DB_SUCCESS; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx_start_if_not_started(trx); + + trx->op_info = "importing tablespace"; + + current_lsn = log_get_lsn(); + + /* It is possible, though very improbable, that the lsn's in the + tablespace to be imported have risen above the current system lsn, if + a lengthy purge, ibuf merge, or rollback was performed on a backup + taken with ibbackup. If that is the case, reset page lsn's in the + file. We assume that mysqld was shut down after it performed these + cleanup operations on the .ibd file, so that it stamped the latest lsn + to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file. + + TODO: reset also the trx id's in clustered index records and write + a new space id to each data page. That would allow us to import clean + .ibd files from another MySQL installation. */ + + success = fil_reset_too_high_lsns(name, current_lsn); + + if (!success) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: in ALTER TABLE ... 
IMPORT TABLESPACE\n", + stderr); + + err = DB_ERROR; + + row_mysql_lock_data_dictionary(trx); + + goto funct_exit; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + ut_print_timestamp(stderr); + fputs(" InnoDB: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: does not exist in the InnoDB data dictionary\n" + "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", + stderr); + + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: is in the system tablespace 0" + " which cannot be imported\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + if (!table->tablespace_discarded) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: you are trying to" + " IMPORT a tablespace\n" + "InnoDB: ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(", though you have not called DISCARD on it yet\n" + "InnoDB: during the lifetime of the mysqld process!\n", + stderr); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + success = fil_open_single_table_tablespace( + TRUE, table->space, + table->flags == DICT_TF_COMPACT ? 0 : table->flags, + table->name); + if (success) { + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; + } else { + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fputs(" InnoDB: cannot find or open in the" + " database directory the .ibd file of\n" + "InnoDB: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", + stderr); + } + + err = DB_ERROR; + } + +funct_exit: + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return((int) err); +} + +/*********************************************************************//** +Truncates a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_truncate_table_for_mysql( +/*=========================*/ + dict_table_t* table, /*!< in: table handle */ + trx_t* trx) /*!< in: transaction handle */ +{ + dict_foreign_t* foreign; + ulint err; + mem_heap_t* heap; + byte* buf; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_index; + btr_pcur_t pcur; + mtr_t mtr; + dulint new_id; + ulint recreate_space = 0; + pars_info_t* info = NULL; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. + + 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive + MySQL table lock on the table before we can do TRUNCATE + TABLE. Then there are no running queries on the table. This is + guaranteed, because in ha_innobase::store_lock(), we do not + weaken the TL_WRITE lock requested by MySQL when executing + SQLCOM_TRUNCATE. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. 
+ + 3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, + so we do not have to remove insert buffer records, as the + insert buffer works at a low level. If a freed page is later + reallocated, the allocator will remove the ibuf entries for + it. + + When we truncate *.ibd files by recreating them (analogous to + DISCARD TABLESPACE), we remove all entries for the table in the + insert buffer tree. This is not strictly necessary, because + in 6) we will assign a new tablespace identifier, but we can + free up some space in the system tablespace. + + 4) Linear readahead and random readahead: we use the same + method as in 3) to discard ongoing operations. (This is only + relevant for TRUNCATE TABLE by DISCARD TABLESPACE.) + + 5) FOREIGN KEY operations: if + table->n_foreign_key_checks_running > 0, we do not allow the + TRUNCATE. We also reserve the data dictionary latch. + + 6) Crash recovery: To prevent the application of pre-truncation + redo log records on the truncated tablespace, we will assign + a new tablespace identifier to the truncated tablespace. */ + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(table); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); + + return(DB_ERROR); + } + + trx->op_info = "truncating table"; + + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + ut_a(trx->dict_operation_lock_mode == 0); + /* Prevent foreign key checks etc. while we are truncating the + table */ + + row_mysql_lock_data_dictionary(trx); + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; + + /* We only allow truncating a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot truncate table ", ef); + ut_print_name(ef, trx, TRUE, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because it is referenced by ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_ERROR; + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been truncated here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot truncate table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because there is a foreign key check" + " running on it.\n", + stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* Remove all locks except the table-level S and X locks. 
*/
+	lock_remove_all_on_table(table, FALSE);
+
+	trx->table_id = table->id;
+
+	if (table->space && !table->dir_path_of_temp_table) {
+		/* Discard and create the single-table tablespace. */
+		ulint	space	= table->space;
+		ulint	flags	= fil_space_get_flags(space);
+
+		if (flags != ULINT_UNDEFINED
+		    && fil_discard_tablespace(space)) {
+
+			dict_index_t*	index;
+
+			space = 0;
+
+			if (fil_create_new_single_table_tablespace(
+				    &space, table->name, FALSE, flags,
+				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: TRUNCATE TABLE %s failed to"
+					" create a new tablespace\n",
+					table->name);
+				table->ibd_file_missing = 1;
+				err = DB_ERROR;
+				goto funct_exit;
+			}
+
+			recreate_space = space;
+
+			/* Replace the space_id in the data dictionary cache.
+			The persistent data dictionary fields
+			(SYS_TABLES.SPACE and SYS_INDEXES.SPACE) are updated
+			later in this function. */
+			table->space = space;
+			index = dict_table_get_first_index(table);
+			do {
+				index->space = space;
+				index = dict_table_get_next_index(index);
+			} while (index);
+
+			mtr_start(&mtr);
+			fsp_header_init(space,
+					FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+			mtr_commit(&mtr);
+		}
+	}
+
+	/* Scan SYS_INDEXES for all indexes of the table */
+	heap = mem_heap_create(800);
+
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, table->id);
+
+	dfield_set_data(dfield, buf, 8);
+	sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	mtr_start(&mtr);
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_MODIFY_LEAF, &pcur, &mtr);
+	for (;;) {
+		rec_t*		rec;
+		const byte*	field;
+		ulint		len;
+		ulint		root_page_no;
+
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			/* The end of SYS_INDEXES has been reached. */
+			break;
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		field = rec_get_nth_field_old(rec, 0, &len);
+		ut_ad(len == 8);
+
+		if (memcmp(buf, field, len) != 0) {
+			/* End of indexes for the table (TABLE_ID mismatch). */
+			break;
+		}
+
+		if (rec_get_deleted_flag(rec, FALSE)) {
+			/* The index has been dropped. */
+			goto next_rec;
+		}
+
+		/* This call may commit and restart mtr
+		and reposition pcur. */
+		root_page_no = dict_truncate_index_tree(table, recreate_space,
+							&pcur, &mtr);
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (root_page_no != FIL_NULL) {
+			page_rec_write_index_page_no(
+				rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				root_page_no, &mtr);
+			/* We will need to commit and restart the
+			mini-transaction in order to avoid deadlocks.
+			The dict_truncate_index_tree() call has allocated
+			a page in this mini-transaction, and the rest of
+			this loop could latch another index page.
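+			Committing releases the page latches taken in
+			this mini-transaction, so the persistent cursor
+			position must be restored before the scan over
+			SYS_INDEXES can continue.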
*/ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, + &pcur, &mtr); + } + +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + mem_heap_free(heap); + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + info = pars_info_create(); + + pars_info_add_int4_literal(info, "space", (lint) table->space); + pars_info_add_dulint_literal(info, "old_id", table->id); + pars_info_add_dulint_literal(info, "new_id", new_id); + + err = que_eval_sql(info, + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES" + " SET ID = :new_id, SPACE = :space\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES" + " SET TABLE_ID = :new_id, SPACE = :space\n" + " WHERE TABLE_ID = :old_id;\n" + "COMMIT WORK;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); + fputs(" InnoDB: Unable to assign a new identifier to table ", + stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: after truncating it. Background processes" + " may corrupt the table!\n", stderr); + err = DB_ERROR; + } else { + dict_table_change_id_in_cache(table, new_id); + } + + /* MySQL calls ha_innobase::reset_auto_increment() which does + the same thing. */ + dict_table_autoinc_lock(table); + dict_table_autoinc_initialize(table, 1); + dict_table_autoinc_unlock(table); + dict_update_statistics(table); + + trx_commit_for_mysql(trx); + +funct_exit: + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + srv_wake_master_thread(); + + return((int) err); +} + +/*********************************************************************//** +Drops a table for MySQL. If the name of the dropped table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also stop the printing of monitor +output by the master thread. If the data dictionary was not already locked +by the transaction, the transaction will be committed. Otherwise, the +data dictionary will remain locked. +@return error code or DB_SUCCESS */ +UNIV_INTERN +int +row_drop_table_for_mysql( +/*=====================*/ + const char* name, /*!< in: table name */ + trx_t* trx, /*!< in: transaction handle */ + ibool drop_db)/*!< in: TRUE=dropping whole database */ +{ + dict_foreign_t* foreign; + dict_table_t* table; + ulint space_id; + ulint err; + const char* table_name; + ulint namelen; + ibool locked_dictionary = FALSE; + pars_info_t* info = NULL; + + ut_a(name != NULL); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); + + return(DB_ERROR); + } + + trx->op_info = "dropping table"; + + trx_start_if_not_started(trx); + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. 
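+	The comparisons below include the terminating NUL byte:
+	namelen counts it, and sizeof of each string constant counts
+	it too, so a longer name such as innodb_monitor_xyz does not
+	match. */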
*/ + table_name = strchr(name, '/'); + ut_a(table_name); + table_name++; + namelen = strlen(table_name) + 1; + + if (namelen == sizeof S_innodb_monitor + && !memcmp(table_name, S_innodb_monitor, + sizeof S_innodb_monitor)) { + + /* Table name equals "innodb_monitor": + stop monitor prints */ + + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_lock_monitor + && !memcmp(table_name, S_innodb_lock_monitor, + sizeof S_innodb_lock_monitor)) { + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_tablespace_monitor + && !memcmp(table_name, S_innodb_tablespace_monitor, + sizeof S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = FALSE; + } else if (namelen == sizeof S_innodb_table_monitor + && !memcmp(table_name, S_innodb_table_monitor, + sizeof S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = FALSE; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + /* Prevent foreign key checks etc. while we are dropping the + table */ + + row_mysql_lock_data_dictionary(trx); + + locked_dictionary = TRUE; + } + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is" + " trying to drop it.\n" + "InnoDB: Have you copied the .frm file" + " of the table to the\n" + "InnoDB: MySQL database directory" + " from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { +check_next_foreign: + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns + && !(drop_db && dict_tables_have_same_db( + name, foreign->foreign_table_name))) { + FILE* ef = dict_foreign_err_file; + + /* We only allow dropping a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + err = DB_CANNOT_DROP_CONSTRAINT; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot drop table ", ef); + ut_print_name(ef, trx, TRUE, name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + goto funct_exit; + } + + if (foreign && trx->check_foreigns) { + goto check_next_foreign; + } + + if (table->n_mysql_handles_opened > 0) { + ibool added; + + added = row_add_table_to_background_drop_list(table->name); + + if (added) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: MySQL is" + " trying to drop table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: though there are still" + " open handles to it.\n" + "InnoDB: Adding the table to the" + " background drop queue.\n", + stderr); + + /* We return DB_SUCCESS to MySQL 
though the drop will + happen lazily later */ + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been dropped here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + + const char* table_name = table->name; + ibool added; + + added = row_add_table_to_background_drop_list(table_name); + + if (added) { + ut_print_timestamp(stderr); + fputs(" InnoDB: You are trying to drop table ", + stderr); + ut_print_name(stderr, trx, TRUE, table_name); + fputs("\n" + "InnoDB: though there is a" + " foreign key check running on it.\n" + "InnoDB: Adding the table to" + " the background drop queue.\n", + stderr); + + /* We return DB_SUCCESS to MySQL though the drop will + happen lazily later */ + + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* Remove all locks there are on the table or its records */ + lock_remove_all_on_table(table, TRUE); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = table->id; + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "table_name", name); + + err = que_eval_sql(info, + "PROCEDURE DROP_TABLE_PROC () IS\n" + "sys_foreign_id CHAR;\n" + "table_id CHAR;\n" + "index_id CHAR;\n" + "foreign_id CHAR;\n" + "found INT;\n" + "BEGIN\n" + "SELECT ID INTO table_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " RETURN;\n" + "END IF;\n" + "found := 1;\n" + "SELECT ID INTO sys_foreign_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = 'SYS_FOREIGN'\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (:table_name = 'SYS_FOREIGN') THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n" + " found := 0;\n" + "END IF;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FOREIGN_COLS\n" + " WHERE ID = foreign_id;\n" + " DELETE FROM SYS_FOREIGN\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + "END LOOP;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO index_id\n" + " FROM SYS_INDEXES\n" + " WHERE TABLE_ID = table_id\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS\n" + " WHERE INDEX_ID = index_id;\n" + " DELETE FROM SYS_INDEXES\n" + " WHERE ID = index_id\n" + " AND TABLE_ID = table_id;\n" + " END IF;\n" + "END LOOP;\n" + "DELETE FROM SYS_COLUMNS\n" + "WHERE TABLE_ID = table_id;\n" + "DELETE FROM SYS_TABLES\n" + "WHERE ID = table_id;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + ut_a(err == DB_OUT_OF_FILE_SPACE); + + err = 
DB_MUST_GET_MORE_FILE_SPACE;
+
+		row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+		ut_error;
+	} else {
+		ibool		is_path;
+		const char*	name_or_path;
+		mem_heap_t*	heap;
+
+		heap = mem_heap_create(200);
+
+		/* Clone the name, in case it has been allocated
+		from table->heap, which will be freed by
+		dict_table_remove_from_cache(table) below. */
+		name = mem_heap_strdup(heap, name);
+		space_id = table->space;
+
+		if (table->dir_path_of_temp_table != NULL) {
+			is_path = TRUE;
+			name_or_path = mem_heap_strdup(
+				heap, table->dir_path_of_temp_table);
+		} else {
+			is_path = FALSE;
+			name_or_path = name;
+		}
+
+		dict_table_remove_from_cache(table);
+
+		if (dict_load_table(name) != NULL) {
+			ut_print_timestamp(stderr);
+			fputs(" InnoDB: Error: not able to remove table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fputs(" from the dictionary cache!\n", stderr);
+			err = DB_ERROR;
+		}
+
+		/* Do not drop possible .ibd tablespace if something went
+		wrong: we do not want to delete valuable data of the user */
+
+		if (err == DB_SUCCESS && space_id > 0) {
+			if (!fil_space_for_table_exists_in_mem(space_id,
+							       name_or_path,
+							       is_path,
+							       FALSE, TRUE)) {
+				err = DB_SUCCESS;
+
+				fprintf(stderr,
+					"InnoDB: We have now removed the"
+					" InnoDB internal data dictionary"
+					" entry\n"
+					"InnoDB: of table ");
+				ut_print_name(stderr, trx, TRUE, name);
+				fprintf(stderr, ".\n");
+			} else if (!fil_delete_tablespace(space_id)) {
+				fprintf(stderr,
+					"InnoDB: We have now removed the"
+					" InnoDB internal data dictionary"
+					" entry\n"
+					"InnoDB: of table ");
+				ut_print_name(stderr, trx, TRUE, name);
+				fprintf(stderr, ".\n");
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: Error: not able to"
+					" delete tablespace %lu of table ",
+					(ulong) space_id);
+				ut_print_name(stderr, trx, TRUE, name);
+				fputs("!\n", stderr);
+				err = DB_ERROR;
+			}
+		}
+
+		mem_heap_free(heap);
+	}
+funct_exit:
+
+	if (locked_dictionary) {
+		trx_commit_for_mysql(trx);
+
+		row_mysql_unlock_data_dictionary(trx);
+	}
+
+	trx->op_info = "";
+
+	srv_wake_master_thread();
+
+	return((int) err);
+}
+
+/*******************************************************************//**
+Drop all foreign keys in a database, see Bug#18942.
+Called at the end of row_drop_database_for_mysql().
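+The constraints are located with a cursor over SYS_FOREIGN in FOR_NAME
+order, starting from the database name prefix; the scan stops at the
+first FOR_NAME which no longer begins with that prefix.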
+@return error code or DB_SUCCESS */
+static
+ulint
+drop_all_foreign_keys_in_db(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	pars_info_t*	pinfo;
+	ulint		err;
+
+	ut_a(name[strlen(name) - 1] == '/');
+
+	pinfo = pars_info_create();
+
+	pars_info_add_str_literal(pinfo, "dbname", name);
+
+/** true if for_name is not prefixed with dbname */
+#define TABLE_NOT_IN_THIS_DB \
+"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
+
+	err = que_eval_sql(pinfo,
+			   "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
+			   "foreign_id CHAR;\n"
+			   "for_name CHAR;\n"
+			   "found INT;\n"
+			   "DECLARE CURSOR cur IS\n"
+			   "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
+			   "WHERE FOR_NAME >= :dbname\n"
+			   "LOCK IN SHARE MODE\n"
+			   "ORDER BY FOR_NAME;\n"
+			   "BEGIN\n"
+			   "found := 1;\n"
+			   "OPEN cur;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "        FETCH cur INTO foreign_id, for_name;\n"
+			   "        IF (SQL % NOTFOUND) THEN\n"
+			   "                found := 0;\n"
+			   "        ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
+			   "                found := 0;\n"
+			   "        ELSIF (1=1) THEN\n"
+			   "                DELETE FROM SYS_FOREIGN_COLS\n"
+			   "                WHERE ID = foreign_id;\n"
+			   "                DELETE FROM SYS_FOREIGN\n"
+			   "                WHERE ID = foreign_id;\n"
+			   "        END IF;\n"
+			   "END LOOP;\n"
+			   "CLOSE cur;\n"
+			   "COMMIT WORK;\n"
+			   "END;\n",
+			   FALSE, /* do not reserve dict mutex,
+				  we are already holding it */
+			   trx);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	dict_table_t*	table;
+	char*		table_name;
+	int		err	= DB_SUCCESS;
+	ulint		namelen	= strlen(name);
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_a(name != NULL);
+	ut_a(name[namelen - 1] == '/');
+
+	trx->op_info = "dropping database";
+
+	trx_start_if_not_started(trx);
+loop:
+	row_mysql_lock_data_dictionary(trx);
+
+	while ((table_name = dict_get_first_table_name_in_db(name))) {
+		ut_a(memcmp(table_name, name, namelen) == 0);
+
+		table = dict_table_get_low(table_name);
+
+		ut_a(table);
+
+		/* Wait until MySQL does not have any queries running on
+		the table */
+
+		if (table->n_mysql_handles_opened > 0) {
+			row_mysql_unlock_data_dictionary(trx);
+
+			ut_print_timestamp(stderr);
+			fputs(" InnoDB: Warning: MySQL is trying to"
+			      " drop database ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fputs("\n"
+			      "InnoDB: though there are still"
+			      " open handles to table ", stderr);
+			ut_print_name(stderr, trx, TRUE, table_name);
+			fputs(".\n", stderr);
+
+			os_thread_sleep(1000000);
+
+			mem_free(table_name);
+
+			goto loop;
+		}
+
+		err = row_drop_table_for_mysql(table_name, trx, TRUE);
+		trx_commit_for_mysql(trx);
+
+		if (err != DB_SUCCESS) {
+			fputs("InnoDB: DROP DATABASE ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fprintf(stderr, " failed with error %lu for table ",
+				(ulint) err);
+			ut_print_name(stderr, trx, TRUE, table_name);
+			putc('\n', stderr);
+			mem_free(table_name);
+			break;
+		}
+
+		mem_free(table_name);
+	}
+
+	if (err == DB_SUCCESS) {
+		/* After dropping all tables, try to drop all leftover
+		foreign keys in case orphaned ones exist */
+		err = (int) drop_all_foreign_keys_in_db(name, trx);
+
+		if (err != DB_SUCCESS) {
+			fputs("InnoDB: DROP DATABASE ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fprintf(stderr, " failed with error %d while "
+				"dropping all foreign 
keys", err); + } + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return TRUE if temporary table */ +static +ibool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) /*!< in: table name in the form + 'database/tablename' */ +{ + return(strstr(name, "/#sql") != NULL); + /* return(strstr(name, "/@0023sql") != NULL); */ +} + +/****************************************************************//** +Delete a single constraint. +@return error code or DB_SUCCESS */ +static +int +row_delete_constraint_low( +/*======================*/ + const char* id, /*!< in: constraint id */ + trx_t* trx) /*!< in: transaction handle */ +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", id); + + return((int) que_eval_sql(info, + "PROCEDURE DELETE_CONSTRAINT () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" + "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n" + "END;\n" + , FALSE, trx)); +} + +/****************************************************************//** +Delete a single constraint. +@return error code or DB_SUCCESS */ +static +int +row_delete_constraint( +/*==================*/ + const char* id, /*!< in: constraint id */ + const char* database_name, /*!< in: database name, with the + trailing '/' */ + mem_heap_t* heap, /*!< in: memory heap */ + trx_t* trx) /*!< in: transaction handle */ +{ + ulint err; + + /* New format constraints have ids <databasename>/<constraintname>. */ + err = row_delete_constraint_low( + mem_heap_strcat(heap, database_name, id), trx); + + if ((err == DB_SUCCESS) && !strchr(id, '/')) { + /* Old format < 4.0.18 constraints have constraint ids + <number>_<number>. We only try deleting them if the + constraint name does not contain a '/' character, otherwise + deleting a new format constraint named 'foo/bar' from + database 'baz' would remove constraint 'bar' from database + 'foo', if it existed. */ + + err = row_delete_constraint_low(id, trx); + } + + return((int) err); +} + +/*********************************************************************//** +Renames a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +ulint +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in: transaction handle */ + ibool commit) /*!< in: if TRUE then commit trx */ +{ + dict_table_t* table; + ulint err = DB_ERROR; + mem_heap_t* heap = NULL; + const char** constraints_to_drop = NULL; + ulint n_constraints_to_drop = 0; + ibool old_is_tmp, new_is_tmp; + pars_info_t* info = NULL; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(old_name != NULL); + ut_a(new_name != NULL); + + if (srv_created_new_raw || srv_force_recovery) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw" + " is replaced\n" + "InnoDB: with raw, and innodb_force_... 
is removed.\n", + stderr); + + goto funct_exit; + } else if (row_mysql_is_system_table(new_name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL" + " system table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be" + " of the MyISAM type!\n", + new_name); + + goto funct_exit; + } + + trx->op_info = "renaming table"; + trx_start_if_not_started(trx); + + old_is_tmp = row_is_mysql_tmp_table_name(old_name); + new_is_tmp = row_is_mysql_tmp_table_name(new_name); + + table = dict_table_get_low(old_name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is" + " trying to rename the table.\n" + "InnoDB: Have you copied the .frm file" + " of the table to the\n" + "InnoDB: MySQL database directory" + " from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } else if (table->ibd_file_missing) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" does not have an .ibd file" + " in the database directory.\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } else if (new_is_tmp) { + /* MySQL is doing an ALTER TABLE command and it renames the + original table to a temporary table name. We want to preserve + the original foreign key constraint definitions despite the + name change. An exception is those constraints for which + the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/ + + heap = mem_heap_create(100); + + err = dict_foreign_parse_drop_constraints( + heap, trx, table, &n_constraints_to_drop, + &constraints_to_drop); + + if (err != DB_SUCCESS) { + + goto funct_exit; + } + } + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data from system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET NAME = :new_table_name\n" + " WHERE NAME = :old_table_name;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + + goto end; + } else if (!new_is_tmp) { + /* Rename all constraints. 
*/ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql( + info, + "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n" + "gen_constr_prefix CHAR;\n" + "new_db_name CHAR;\n" + "foreign_id CHAR;\n" + "new_foreign_id CHAR;\n" + "old_db_name_len INT;\n" + "old_t_name_len INT;\n" + "new_db_name_len INT;\n" + "id_len INT;\n" + "found INT;\n" + "BEGIN\n" + "found := 1;\n" + "old_db_name_len := INSTR(:old_table_name, '/')-1;\n" + "new_db_name_len := INSTR(:new_table_name, '/')-1;\n" + "new_db_name := SUBSTR(:new_table_name, 0,\n" + " new_db_name_len);\n" + "old_t_name_len := LENGTH(:old_table_name);\n" + "gen_constr_prefix := CONCAT(:old_table_name,\n" + " '_ibfk_');\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :old_table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:old_table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " UPDATE SYS_FOREIGN\n" + " SET FOR_NAME = :new_table_name\n" + " WHERE ID = foreign_id;\n" + " id_len := LENGTH(foreign_id);\n" + " IF (INSTR(foreign_id, '/') > 0) THEN\n" + " IF (INSTR(foreign_id,\n" + " gen_constr_prefix) > 0)\n" + " THEN\n" + " new_foreign_id :=\n" + " CONCAT(:new_table_name,\n" + " SUBSTR(foreign_id, old_t_name_len,\n" + " id_len - old_t_name_len));\n" + " ELSE\n" + " new_foreign_id :=\n" + " CONCAT(new_db_name,\n" + " SUBSTR(foreign_id,\n" + " old_db_name_len,\n" + " id_len - old_db_name_len));\n" + " END IF;\n" + " UPDATE SYS_FOREIGN\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " UPDATE SYS_FOREIGN_COLS\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n" + "WHERE REF_NAME = :old_table_name\n" + " AND TO_BINARY(REF_NAME)\n" + " = TO_BINARY(:old_table_name);\n" + "END;\n" + , FALSE, trx); + + } else if (n_constraints_to_drop > 0) { + /* Drop some constraints of tmp tables. 
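+		These are the constraints that the ALTER TABLE statement
+		named in DROP FOREIGN KEY clauses; they were collected by
+		dict_foreign_parse_drop_constraints() above and are now
+		deleted one by one with row_delete_constraint().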
*/ + + ulint db_name_len = dict_get_db_name_len(old_name) + 1; + char* db_name = mem_heap_strdupl(heap, old_name, + db_name_len); + ulint i; + + for (i = 0; i < n_constraints_to_drop; i++) { + err = row_delete_constraint(constraints_to_drop[i], + db_name, heap, trx); + + if (err != DB_SUCCESS) { + break; + } + } + } + +end: + if (err != DB_SUCCESS) { + if (err == DB_DUPLICATE_KEY) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error; possible reasons:\n" + "InnoDB: 1) Table rename would cause" + " two FOREIGN KEY constraints\n" + "InnoDB: to have the same internal name" + " in case-insensitive comparison.\n" + "InnoDB: 2) table ", stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs(" exists in the InnoDB internal data\n" + "InnoDB: dictionary though MySQL is" + " trying to rename table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" to it.\n" + "InnoDB: Have you deleted the .frm file" + " and not used DROP TABLE?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: If table ", stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs(" is a temporary table #sql..., then" + " it can be that\n" + "InnoDB: there are still queries running" + " on the table, and it will be\n" + "InnoDB: dropped automatically when" + " the queries end.\n" + "InnoDB: You can drop the orphaned table" + " inside InnoDB by\n" + "InnoDB: creating an InnoDB table with" + " the same name in another\n" + "InnoDB: database and copying the .frm file" + " to the current database.\n" + "InnoDB: Then MySQL thinks the table exists," + " and DROP TABLE will\n" + "InnoDB: succeed.\n", stderr); + } + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + /* The following call will also rename the .ibd data file if + the table is stored in a single-table tablespace */ + + if (!dict_table_rename_in_cache(table, new_name, + !new_is_tmp)) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + goto funct_exit; + } + + /* We only want to switch off some of the type checking in + an ALTER, not in a RENAME. 
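+		The check flag passed to dict_load_foreigns() below is
+		(!old_is_tmp || trx->check_foreigns): the relaxed checking
+		thus applies only when this rename is the final step of an
+		ALTER TABLE and the session has switched off
+		FOREIGN_KEY_CHECKS.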
*/ + + err = dict_load_foreigns( + new_name, !old_is_tmp || trx->check_foreigns); + + if (err != DB_SUCCESS) { + ut_print_timestamp(stderr); + + if (old_is_tmp) { + fputs(" InnoDB: Error: in ALTER TABLE ", + stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs("\n" + "InnoDB: has or is referenced" + " in foreign key constraints\n" + "InnoDB: which are not compatible" + " with the new table definition.\n", + stderr); + } else { + fputs(" InnoDB: Error: in RENAME TABLE" + " table ", + stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs("\n" + "InnoDB: is referenced in" + " foreign key constraints\n" + "InnoDB: which are not compatible" + " with the new table definition.\n", + stderr); + } + + ut_a(dict_table_rename_in_cache(table, + old_name, FALSE)); + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } + } + +funct_exit: + + if (commit) { + trx_commit_for_mysql(trx); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Checks that the index contains entries in an ascending order, unique +constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. +@return TRUE if ok */ +static +ibool +row_scan_and_check_index( +/*=====================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL */ + dict_index_t* index, /*!< in: index */ + ulint* n_rows) /*!< out: number of entries seen in the + current consistent read */ +{ + dtuple_t* prev_entry = NULL; + ulint matched_fields; + ulint matched_bytes; + byte* buf; + ulint ret; + rec_t* rec; + ibool is_ok = TRUE; + int cmp; + ibool contains_null; + ulint i; + ulint cnt; + mem_heap_t* heap = NULL; + ulint n_ext; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + *n_rows = 0; + + if (!row_merge_is_index_usable(prebuilt->trx, index)) { + /* A newly created index may lack some delete-marked + records that may exist in the read view of + prebuilt->trx. Thus, such indexes must not be + accessed by consistent read. */ + return(is_ok); + } + + buf = mem_alloc(UNIV_PAGE_SIZE); + heap = mem_heap_create(100); + + /* Make a dummy template in prebuilt, which we will use + in scanning the index entries */ + + prebuilt->index = index; + /* row_merge_is_index_usable() was already checked above. */ + prebuilt->index_usable = TRUE; + prebuilt->sql_stat_start = TRUE; + prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; + prebuilt->n_template = 0; + prebuilt->need_to_access_clustered = FALSE; + + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + prebuilt->select_lock_type = LOCK_NONE; + cnt = 1000; + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0); +loop: + /* Check thd->killed every 1,000 scanned rows */ + if (--cnt == 0) { + if (trx_is_interrupted(prebuilt->trx)) { + goto func_exit; + } + cnt = 1000; + } + + switch (ret) { + case DB_SUCCESS: + break; + default: + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: CHECK TABLE on ", stderr); + dict_index_name_print(stderr, prebuilt->trx, index); + fprintf(stderr, " returned %lu\n", ret); + /* fall through (this error is ignored by CHECK TABLE) */ + case DB_END_OF_INDEX: +func_exit: + mem_free(buf); + mem_heap_free(heap); + + return(is_ok); + } + + *n_rows = *n_rows + 1; + + /* row_search... 
returns the index record in buf, record origin offset + within buf stored in the first 4 bytes, because we have built a dummy + template */ + + rec = buf + mach_read_from_4(buf); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (prev_entry != NULL) { + matched_fields = 0; + matched_bytes = 0; + + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, + &matched_fields, + &matched_bytes); + contains_null = FALSE; + + /* In a unique secondary index we allow equal key values if + they contain SQL NULLs */ + + for (i = 0; + i < dict_index_get_n_ordering_defined_by_user(index); + i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(prev_entry, i))) { + + contains_null = TRUE; + } + } + + if (cmp > 0) { + fputs("InnoDB: index records in a wrong order in ", + stderr); +not_ok: + dict_index_name_print(stderr, + prebuilt->trx, index); + fputs("\n" + "InnoDB: prev record ", stderr); + dtuple_print(stderr, prev_entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + is_ok = FALSE; + } else if (dict_index_is_unique(index) + && !contains_null + && matched_fields + >= dict_index_get_n_ordering_defined_by_user( + index)) { + + fputs("InnoDB: duplicate key in ", stderr); + goto not_ok; + } + } + + { + mem_heap_t* tmp_heap = NULL; + + /* Empty the heap on each round. But preserve offsets[] + for the row_rec_to_index_entry() call, by copying them + into a separate memory heap when needed. */ + if (UNIV_UNLIKELY(offsets != offsets_)) { + ulint size = rec_offs_get_n_alloc(offsets) + * sizeof *offsets; + + tmp_heap = mem_heap_create(size); + offsets = mem_heap_dup(tmp_heap, offsets, size); + } + + mem_heap_empty(heap); + + prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, + index, offsets, + &n_ext, heap); + + if (UNIV_LIKELY_NULL(tmp_heap)) { + mem_heap_free(tmp_heap); + } + } + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); + + goto loop; +} + +/*********************************************************************//** +Checks a table for corruption. +@return DB_ERROR or DB_SUCCESS */ +UNIV_INTERN +ulint +row_check_table_for_mysql( +/*======================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + dict_index_t* index; + ulint n_rows; + ulint n_rows_in_table = ULINT_UNDEFINED; + ulint ret = DB_SUCCESS; + ulint old_isolation_level; + + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + table->name); + return(DB_ERROR); + } + + prebuilt->trx->op_info = "checking table"; + + old_isolation_level = prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we use always + REPEATABLE READ here */ + + prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + /* Enlarge the fatal lock wait timeout during CHECK TABLE. 
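+	A CHECK TABLE of a big table may legitimately hold latches
+	longer than srv_fatal_semaphore_wait_threshold would normally
+	allow; the threshold is raised by 7200 seconds here, under
+	kernel_mutex, and lowered by the same amount at the end of
+	this function.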
*/ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + index = dict_table_get_first_index(table); + + while (index != NULL) { + /* fputs("Validating index ", stderr); + ut_print_name(stderr, trx, FALSE, index->name); + putc('\n', stderr); */ + + if (!btr_validate_index(index, prebuilt->trx)) { + ret = DB_ERROR; + } else { + if (!row_scan_and_check_index(prebuilt,index, &n_rows)){ + ret = DB_ERROR; + } + + if (trx_is_interrupted(prebuilt->trx)) { + break; + } + + /* fprintf(stderr, "%lu entries in index %s\n", n_rows, + index->name); */ + + if (index == dict_table_get_first_index(table)) { + n_rows_in_table = n_rows; + } else if (n_rows != n_rows_in_table) { + + ret = DB_ERROR; + + fputs("Error: ", stderr); + dict_index_name_print(stderr, + prebuilt->trx, index); + fprintf(stderr, + " contains %lu entries," + " should be %lu\n", + (ulong) n_rows, + (ulong) n_rows_in_table); + } + } + + index = dict_table_get_next_index(index); + } + + /* Restore the original isolation level */ + prebuilt->trx->isolation_level = old_isolation_level; + + /* We validate also the whole adaptive hash index for all tables + at every CHECK TABLE */ + + if (!btr_search_validate()) { + + ret = DB_ERROR; + } + + /* Restore the fatal lock wait timeout after CHECK TABLE. */ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + prebuilt->trx->op_info = ""; + + return(ret); +} + +/*********************************************************************//** +Determines if a table is a magic monitor table. +@return TRUE if monitor table */ +UNIV_INTERN +ibool +row_is_magic_monitor_table( +/*=======================*/ + const char* table_name) /*!< in: name of the table, in the + form database/table_name */ +{ + const char* name; /* table_name without database/ */ + ulint len; + + name = strchr(table_name, '/'); + ut_a(name != NULL); + name++; + len = strlen(name) + 1; + + if (STR_EQ(name, len, S_innodb_monitor) + || STR_EQ(name, len, S_innodb_lock_monitor) + || STR_EQ(name, len, S_innodb_tablespace_monitor) + || STR_EQ(name, len, S_innodb_table_monitor) + || STR_EQ(name, len, S_innodb_mem_validate)) { + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.c new file mode 100644 index 00000000000..500ebe571ab --- /dev/null +++ b/storage/innobase/row/row0purge.c @@ -0,0 +1,689 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0purge.c +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" + +#ifdef UNIV_NONINL +#include "row0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "row0mysql.h" +#include "log0log.h" + +/********************************************************************//** +Creates a purge node to a query graph. +@return own: purge node */ +UNIV_INTERN +purge_node_t* +row_purge_node_create( +/*==================*/ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + purge_node_t* node; + + ut_ad(parent && heap); + + node = mem_heap_alloc(heap, sizeof(purge_node_t)); + + node->common.type = QUE_NODE_PURGE; + node->common.parent = parent; + + node->heap = mem_heap_create(256); + + return(node); +} + +/***********************************************************//** +Repositions the pcur in the purge node on the clustered index record, +if found. +@return TRUE if the record was found */ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + ulint mode, /*!< in: latching mode */ + purge_node_t* node, /*!< in: row purge node */ + mtr_t* mtr) /*!< in: mtr */ +{ + ibool found; + + if (node->found_clust) { + found = btr_pcur_restore_position(mode, &(node->pcur), mtr); + + return(found); + } + + found = row_search_on_row_ref(&(node->pcur), mode, node->table, + node->ref, mtr); + node->found_clust = found; + + if (found) { + btr_pcur_store_position(&(node->pcur), mtr); + } + + return(found); +} + +/***********************************************************//** +Removes a delete marked clustered index record if possible. 
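+The function repositions the purge node's persistent cursor on the
+clustered index record and compares the record's roll pointer with
+node->roll_ptr: if they differ, a later transaction has modified the
+record, and it must not be removed. Otherwise the record is deleted
+with btr_cur_optimistic_delete() (BTR_MODIFY_LEAF) or
+btr_cur_pessimistic_delete() (BTR_MODIFY_TREE).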
+@return TRUE if success, or if not found, or if modified after the +delete marking */ +static +ibool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + purge_node_t* node, /*!< in: row purge node */ + ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ibool success; + ulint err; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + mtr_start(&mtr); + + success = row_purge_reposition_pcur(mode, node, &mtr); + + if (!success) { + /* The record is already removed */ + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + rec = btr_pcur_get_rec(pcur); + + if (0 != ut_dulint_cmp(node->roll_ptr, row_get_rec_roll_ptr( + rec, index, rec_get_offsets( + rec, index, offsets_, + ULINT_UNDEFINED, &heap)))) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + /* Someone else has modified the record later: do not remove */ + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NONE, &mtr); + + if (err == DB_SUCCESS) { + success = TRUE; + } else if (err == DB_OUT_OF_FILE_SPACE) { + success = FALSE; + } else { + ut_error; + } + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(success); +} + +/***********************************************************//** +Removes a clustered index record if it has not been modified after the delete +marking. */ +static +void +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node) /*!< in: row purge node */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing clustered record\n", stderr); */ + + success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/***********************************************************//** +Removes a secondary index entry if possible. +@return TRUE if success or if not found */ +static +ibool +row_purge_remove_sec_if_poss_low( +/*=============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: index entry */ + ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has = 0; /* remove warning */ + ibool found; + ulint err; + mtr_t mtr; + mtr_t mtr_vers; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + if (!found) { + /* Not found. This is a legitimate condition. In a + rollback, InnoDB will remove secondary recs that would + be purged anyway. Then the actual purge will not find + the secondary index record. 
Also, the purge itself is + eager: if it comes to consider a secondary index + record, and notices it does not need to exist in the + index, it will remove it. Then if/when the purge + comes to consider the secondary index record a second + time, it will not exist any more in the index. */ + + /* fputs("PURGE:........sec entry not found\n", stderr); */ + /* dtuple_print(stderr, entry); */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(TRUE); + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + + mtr_start(&mtr_vers); + + success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers); + + if (success) { + old_has = row_vers_old_has_index_entry( + TRUE, btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + + if (!success || !old_has) { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NONE, &mtr); + success = err == DB_SUCCESS; + ut_a(success || err == DB_OUT_OF_FILE_SPACE); + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(success); +} + +/***********************************************************//** +Removes a secondary index entry if possible. */ +UNIV_INLINE +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing secondary record\n", stderr); */ + + success = row_purge_remove_sec_if_poss_low(node, index, entry, + BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_low(node, index, entry, + BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/***********************************************************//** +Purges a delete marking of a record. */ +static +void +row_purge_del_mark( +/*===============*/ + purge_node_t* node) /*!< in: row purge node */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + + ut_ad(node); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + /* Build the index entry */ + entry = row_build_index_entry(node->row, NULL, index, heap); + ut_a(entry); + row_purge_remove_sec_if_poss(node, index, entry); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + row_purge_remove_clust_if_poss(node); +} + +/***********************************************************//** +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. 
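+For a TRX_UNDO_UPD_EXIST_REC record, secondary index entries whose
+ordering fields were changed by the update are removed first; then any
+externally stored (BLOB) fields replaced by the update are freed with
+btr_free_externally_stored_field(), under an X-latch on the clustered
+index tree and its root page.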
*/ +static +void +row_purge_upd_exist_or_extern( +/*==========================*/ + purge_node_t* node) /*!< in: row purge node */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + ulint i; + mtr_t mtr; + + ut_ad(node); + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + goto skip_secondaries; + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field_binary(NULL, node->index, + node->update)) { + /* Build the older version of the index entry */ + entry = row_build_index_entry(node->row, NULL, + index, heap); + ut_a(entry); + row_purge_remove_sec_if_poss(node, index, entry); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + +skip_secondaries: + /* Free possible externally stored fields */ + for (i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + buf_block_t* block; + ulint internal_offset; + byte* data_field; + + /* We use the fact that new_val points to + node->undo_rec and get thus the offset of + dfield data inside the undo record. Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + internal_offset + = ((const byte*) + dfield_get_data(&ufield->new_val)) + - node->undo_rec; + + ut_a(internal_offset < UNIV_PAGE_SIZE); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + mtr_start(&mtr); + + /* We have to acquire an X-latch to the clustered + index tree */ + + index = dict_table_get_first_index(node->table); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + /* NOTE: we must also acquire an X-latch to the + root page of the tree. We will need it when we + free pages from the tree. If the tree is of height 1, + the tree X-latch does NOT protect the root page, + because it is also a leaf page. Since we will have a + latch on an undo log page, we would break the + latching order if we would only later latch the + root page of such a tree! */ + + btr_root_get(index, &mtr); + + /* We assume in purge of externally stored fields + that the space id of the undo log record is 0! */ + + block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + data_field = buf_block_get_frame(block) + + offset + internal_offset; + + ut_a(dfield_get_len(&ufield->new_val) + >= BTR_EXTERN_FIELD_REF_SIZE); + btr_free_externally_stored_field( + index, + data_field + dfield_get_len(&ufield->new_val) + - BTR_EXTERN_FIELD_REF_SIZE, + NULL, NULL, NULL, 0, RB_NONE, &mtr); + mtr_commit(&mtr); + } + } +} + +/***********************************************************//** +Parses the row reference and other info in a modify undo log record. +@return TRUE if purge operation required: NOTE that then the CALLER +must unfreeze data dictionary! 
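+FALSE is returned without freezing the dictionary when the undo record
+requires no index changes (a TRX_UNDO_UPD_DEL_REC, or an update with
+UPD_NODE_NO_ORD_CHANGE, that updated no externally stored column), and
+with the dictionary frozen and unfrozen again when the table has been
+dropped, its .ibd file is missing, or its clustered index is corrupt.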
*/ +static +ibool +row_purge_parse_undo_rec( +/*=====================*/ + purge_node_t* node, /*!< in: row undo node */ + ibool* updated_extern, + /*!< out: TRUE if an externally stored field + was updated */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + trx_t* trx; + undo_no_t undo_no; + dulint table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + + ut_ad(node && thr); + + trx = thr_get_trx(thr); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + updated_extern, &undo_no, &table_id); + node->rec_type = type; + + if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) { + + return(FALSE); + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + node->table = NULL; + + if (type == TRX_UNDO_UPD_EXIST_REC + && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) { + + /* Purge requires no changes to indexes: we may return */ + + return(FALSE); + } + + /* Prevent DROP TABLE etc. from running when we are doing the purge + for this row */ + + row_mysql_freeze_data_dictionary(trx); + + mutex_enter(&(dict_sys->mutex)); + + node->table = dict_table_get_on_id_low(table_id); + + mutex_exit(&(dict_sys->mutex)); + + if (node->table == NULL) { + /* The table has been dropped: no need to do purge */ +err_exit: + row_mysql_unfreeze_data_dictionary(trx); + return(FALSE); + } + + if (node->table->ibd_file_missing) { + /* We skip purge of missing .ibd files */ + + node->table = NULL; + + goto err_exit; + } + + clust_index = dict_table_get_first_index(node->table); + + if (clust_index == NULL) { + /* The table was corrupt in the data dictionary */ + + goto err_exit; + } + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + ptr = trx_undo_rec_get_partial_row( + ptr, clust_index, &node->row, + type == TRX_UNDO_UPD_DEL_REC, + node->heap); + } + + return(TRUE); +} + +/***********************************************************//** +Fetches an undo log record and does the purge for the recorded operation. +If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. +@return DB_SUCCESS if operation successfully completed, else error code */ +static +ulint +row_purge( +/*======*/ + purge_node_t* node, /*!< in: row purge node */ + que_thr_t* thr) /*!< in: query thread */ +{ + roll_ptr_t roll_ptr; + ibool purge_needed; + ibool updated_extern; + trx_t* trx; + + ut_ad(node && thr); + + trx = thr_get_trx(thr); + + node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr, + &(node->reservation), + node->heap); + if (!node->undo_rec) { + /* Purge completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + + if (node->undo_rec == &trx_purge_dummy_rec) { + purge_needed = FALSE; + } else { + purge_needed = row_purge_parse_undo_rec(node, &updated_extern, + thr); + /* If purge_needed == TRUE, we must also remember to unfreeze + data dictionary! 
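+		row_purge_parse_undo_rec() froze it with
+		row_mysql_freeze_data_dictionary(); the matching unfreeze
+		is at the end of the if (purge_needed) block below.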
*/ + } + + if (purge_needed) { + node->found_clust = FALSE; + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { + row_purge_del_mark(node); + + } else if (updated_extern + || node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + row_purge_upd_exist_or_extern(node); + } + + if (node->found_clust) { + btr_pcur_close(&(node->pcur)); + } + + row_mysql_unfreeze_data_dictionary(trx); + } + + /* Do some cleanup */ + trx_purge_rec_release(node->reservation); + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(DB_SUCCESS); +} + +/***********************************************************//** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + purge_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + + err = row_purge(node, thr); + + ut_ad(err == DB_SUCCESS); + + return(thr); +} diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c new file mode 100644 index 00000000000..128ac3ba3e8 --- /dev/null +++ b/storage/innobase/row/row0row.c @@ -0,0 +1,1168 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0row.c +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" + +#ifdef UNIV_NONINL +#include "row0row.ic" +#endif + +#include "data0type.h" +#include "dict0dict.h" +#include "btr0btr.h" +#include "ha_prototypes.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "ut0mem.h" + +/*********************************************************************//** +Gets the offset of trx id field, in bytes relative to the origin of +a clustered index record. 
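+In a clustered index record the 7-byte roll pointer (DATA_ROLL_PTR) is
+stored directly after the 6-byte DATA_TRX_ID field, so this offset also
+locates the roll pointer of the record.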
+@return offset of DATA_TRX_ID */ +UNIV_INTERN +ulint +row_get_trx_id_offset( +/*==================*/ + const rec_t* rec __attribute__((unused)), + /*!< in: record */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint pos; + ulint offset; + ulint len; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offset = rec_get_nth_field_offs(offsets, pos, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INTERN +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + row_ext_t* ext, /*!< in: externally stored column prefixes, + or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which the memory for + the index entry is allocated */ +{ + dtuple_t* entry; + ulint entry_len; + ulint i; + + ut_ad(row && index && heap); + ut_ad(dtuple_check_typed(row)); + + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + dtuple_set_n_fields_cmp(entry, entry_len); + /* There may only be externally stored columns + in a clustered index B-tree of a user table. */ + ut_a(!ext); + } else { + dtuple_set_n_fields_cmp( + entry, dict_index_get_n_unique_in_tree(index)); + } + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = ind_field->col; + ulint col_no + = dict_col_get_no(col); + dfield_t* dfield + = dtuple_get_nth_field(entry, i); + const dfield_t* dfield2 + = dtuple_get_nth_field(row, col_no); + ulint len + = dfield_get_len(dfield2); + + dfield_copy(dfield, dfield2); + + if (dfield_is_null(dfield) || ind_field->prefix_len == 0) { + continue; + } + + /* If a column prefix index, take only the prefix. + Prefix-indexed columns may be externally stored. */ + ut_ad(col->ord_part); + + if (UNIV_LIKELY_NULL(ext)) { + /* See if the column is stored externally. */ + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + return(NULL); + } + dfield_set_data(dfield, buf, len); + } + } else if (dfield_is_ext(dfield)) { + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ut_a(ind_field->prefix_len <= len + || dict_index_is_clust(index)); + } + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, len, dfield_get_data(dfield)); + dfield_set_len(dfield, len); + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! 
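+*ext is set to a cache of the prefixes of those externally stored
+columns that appear among the ordering columns, built with
+row_ext_create(), or to NULL when there are none.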
*/ +UNIV_INTERN +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* row; + const dict_table_t* table; + ulint n_fields; + ulint n_ext_cols; + ulint* ext_cols = NULL; /* remove warning */ + ulint len; + ulint row_len; + byte* buf; + ulint i; + ulint j; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(index && rec && heap); + ut_ad(dict_index_is_clust(index)); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + if (type != ROW_COPY_POINTERS) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, (ulint*) offsets); + } + + table = index->table; + row_len = dict_table_get_n_cols(table); + + row = dtuple_create(heap, row_len); + + dict_table_copy_types(row, table); + + dtuple_set_info_bits(row, rec_get_info_bits( + rec, dict_table_is_comp(table))); + + n_fields = rec_offs_n_fields(offsets); + n_ext_cols = rec_offs_n_extern(offsets); + if (n_ext_cols) { + ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols); + } + + for (i = j = 0; i < n_fields; i++) { + dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(ind_field); + ulint col_no + = dict_col_get_no(col); + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + + if (ind_field->prefix_len == 0) { + + const byte* field = rec_get_nth_field( + rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + } + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + + if (UNIV_LIKELY_NULL(col_table)) { + ut_a(col_no + < dict_table_get_n_cols(col_table)); + col = dict_table_get_nth_col( + col_table, col_no); + } + + if (col->ord_part) { + /* We will have to fetch prefixes of + externally stored columns that are + referenced by column prefixes. */ + ext_cols[j++] = col_no; + } + } + } + + ut_ad(dtuple_check_typed(row)); + + if (j) { + *ext = row_ext_create(j, ext_cols, row, + dict_table_zip_size(index->table), + heap); + } else { + *ext = NULL; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(row); +} + +/*******************************************************************//** +Converts an index record to a typed data tuple. 
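+Unlike row_rec_to_index_entry(), this low-level variant neither copies
+the record nor validates the offsets with rec_offs_validate(), because
+row0merge.c may pass in records whose header is in a different format.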
+@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + ulint i; + const byte* field; + ulint len; + ulint rec_len; + + ut_ad(rec && heap && index); + /* Because this function may be invoked by row0merge.c + on a record whose header is in different format, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + ut_ad(n_ext); + *n_ext = 0; + + rec_len = rec_offs_n_fields(offsets); + + entry = dtuple_create(heap, rec_len); + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(rec_len == dict_index_get_n_fields(index)); + + dict_index_copy_types(entry, index, rec_len); + + for (i = 0; i < rec_len; i++) { + + dfield = dtuple_get_nth_field(entry, i); + field = rec_get_nth_field(rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + (*n_ext)++; + } + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. +@return own: index entry built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + ulint type, /*!< in: ROW_COPY_DATA, or + ROW_COPY_POINTERS: the former + copies also the data fields to + heap as the latter only places + pointers to data fields on the + index page */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the dtuple is used! */ + const dict_index_t* index, /*!< in: index */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* entry; + byte* buf; + + ut_ad(rec && heap && index); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); + + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); + + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! 
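+Where the clustered index key contains a column prefix, the field
+length copied from the secondary index record is trimmed to that
+prefix with dtype_get_at_most_n_mbchars().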
*/ +UNIV_INTERN +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dtuple_t* ref; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index && rec && heap); + ut_ad(!dict_index_is_clust(index)); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(ref); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! 
*/ + const dict_index_t* index, /*!< in: secondary index */ + ulint* offsets,/*!< in: rec_get_offsets(rec, index) + or NULL */ + trx_t* trx) /*!< in: transaction */ +{ + const dict_index_t* clust_index; + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_a(ref); + ut_a(index); + ut_a(rec); + ut_ad(!dict_index_is_clust(index)); + + if (UNIV_UNLIKELY(!index->table)) { + fputs("InnoDB: table ", stderr); +notfound: + ut_print_name(stderr, trx, TRUE, index->table_name); + fputs(" for index ", stderr); + ut_print_name(stderr, trx, FALSE, index->name); + fputs(" not found\n", stderr); + ut_error; + } + + clust_index = dict_table_get_first_index(index->table); + + if (UNIV_UNLIKELY(!clust_index)) { + fputs("InnoDB: clust index for table ", stderr); + goto notfound; + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************//** +Searches the clustered index record for a row, if we have the row reference. +@return TRUE if found */ +UNIV_INTERN +ibool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ulint low_match; + rec_t* rec; + dict_index_t* index; + + ut_ad(dtuple_check_typed(ref)); + + index = dict_table_get_first_index(table); + + ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index)); + + btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr); + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_infimum(rec)) { + + return(FALSE); + } + + if (low_match != dtuple_get_n_fields(ref)) { + + return(FALSE); + } + + return(TRUE); +} + +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. 
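+A row reference is built with ROW_COPY_POINTERS and a persistent cursor
+is opened on the clustered index in the given latch mode; the cursor is
+closed before returning, but the returned record pointer stays valid
+because the caller's mtr keeps the page latched.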
+@return record or NULL, if no record found */ +UNIV_INTERN +rec_t* +row_get_clust_rec( +/*==============*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + ibool found; + rec_t* clust_rec; + + ut_ad(!dict_index_is_clust(index)); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL; + + mem_heap_free(heap); + + btr_pcur_close(&pcur); + + *clust_index = dict_table_get_first_index(table); + + return(clust_rec); +} + +/***************************************************************//** +Searches an index record. +@return TRUE if found */ +UNIV_INTERN +ibool +row_search_index_entry( +/*===================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: index entry */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint n_fields; + ulint low_match; + rec_t* rec; + + ut_ad(dtuple_check_typed(entry)); + + btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr); + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + n_fields = dtuple_get_n_fields(entry); + + return(!page_rec_is_infimum(rec) && low_match == n_fields); +} + +#include <my_sys.h> + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_INT using "prtype" and writes the result to "buf". +If the data is in unknown format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). +@return number of bytes that were written */ +static +ulint +row_raw_format_int( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formated in hex */ +{ + ulint ret; + + if (data_len <= sizeof(ullint)) { + + ullint value; + ibool unsigned_type = prtype & DATA_UNSIGNED; + + value = mach_read_int_type((const byte*) data, + data_len, unsigned_type); + + if (unsigned_type) { + + ret = ut_snprintf(buf, buf_size, "%llu", + value) + 1; + } else { + + ret = ut_snprintf(buf, buf_size, "%lld", + (long long) value) + 1; + } + + } else { + + *format_in_hex = TRUE; + ret = 0; + } + + return(ut_min(ret, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the +result to "buf". 
+If the data is in binary format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). +@return number of bytes that were written */ +static +ulint +row_raw_format_str( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formated in hex */ +{ + ulint charset_coll; + + if (buf_size == 0) { + + return(0); + } + + /* we assume system_charset_info is UTF-8 */ + + charset_coll = dtype_get_charset_coll(prtype); + + if (UNIV_LIKELY(dtype_is_utf8(prtype))) { + + return(ut_str_sql_format(data, data_len, buf, buf_size)); + } + /* else */ + + if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) { + + *format_in_hex = TRUE; + return(0); + } + /* else */ + + return(innobase_raw_format(data, data_len, charset_coll, + buf, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint mtype; + ulint prtype; + ulint ret; + ibool format_in_hex; + + if (buf_size == 0) { + + return(0); + } + + if (data_len == UNIV_SQL_NULL) { + + ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1; + + return(ut_min(ret, buf_size)); + } + + mtype = dict_field->col->mtype; + prtype = dict_field->col->prtype; + + format_in_hex = FALSE; + + switch (mtype) { + case DATA_INT: + + ret = row_raw_format_int(data, data_len, prtype, + buf, buf_size, &format_in_hex); + break; + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + + ret = row_raw_format_str(data, data_len, prtype, + buf, buf_size, &format_in_hex); + break; + /* XXX support more data types */ + default: + + format_in_hex = TRUE; + } + + if (format_in_hex) { + + if (UNIV_LIKELY(buf_size > 2)) { + + memcpy(buf, "0x", 2); + buf += 2; + buf_size -= 2; + ret = 2 + ut_raw_to_hex(data, data_len, + buf, buf_size); + } else { + + buf[0] = '\0'; + ret = 1; + } + } + + return(ret); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include "ut0dbg.h" + +void +test_row_raw_format_int() +{ + ulint ret; + char buf[128]; + ibool format_in_hex; + +#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\ + ret_expected, buf_expected, format_in_hex_expected)\ + do {\ + ibool ok = TRUE;\ + ulint i;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + format_in_hex = FALSE;\ + fprintf(stderr, "TESTING \"\\x");\ + for (i = 0; i < data_len; i++) {\ + fprintf(stderr, "%02hhX", data[i]);\ + }\ + fprintf(stderr, "\", %lu, 
%lu, %lu\n",\ + (ulint) data_len, (ulint) prtype,\ + (ulint) buf_size);\ + ret = row_raw_format_int(data, data_len, prtype,\ + buf, buf_size, &format_in_hex);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (format_in_hex != format_in_hex_expected) {\ + fprintf(stderr, "expected format_in_hex %d, got %d\n",\ + (int) format_in_hex_expected,\ + (int) format_in_hex);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\ + (ulint) ret, buf, (int) format_in_hex);\ + } else {\ + return;\ + }\ + } while (0) + +#if 1 + /* min values for signed 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, 0, + buf, sizeof(buf), 5, "-128", 0); + + CALL_AND_TEST("\x00\x00", 2, 0, + buf, sizeof(buf), 7, "-32768", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, 0, + buf, sizeof(buf), 9, "-8388608", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, 0, + buf, sizeof(buf), 12, "-2147483648", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0, + buf, sizeof(buf), 14, "-549755813888", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0, + buf, sizeof(buf), 17, "-140737488355328", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0, + buf, sizeof(buf), 19, "-36028797018963968", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0, + buf, sizeof(buf), 21, "-9223372036854775808", 0); + + /* min values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + /* max values for signed 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, 0, + buf, sizeof(buf), 4, "127", 0); + + CALL_AND_TEST("\xFF\xFF", 2, 0, + buf, sizeof(buf), 6, "32767", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, 0, + buf, sizeof(buf), 8, "8388607", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0, + buf, sizeof(buf), 11, "2147483647", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0, + buf, sizeof(buf), 13, "549755813887", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0, + buf, sizeof(buf), 16, "140737488355327", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0, + buf, sizeof(buf), 18, "36028797018963967", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0, + buf, sizeof(buf), 20, "9223372036854775807", 0); + + /* max values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED, + buf, sizeof(buf), 4, "255", 0); + + CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "65535", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED, + buf, sizeof(buf), 9, "16777215", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED, + buf, sizeof(buf), 11, "4294967295", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED, + buf, 
sizeof(buf), 14, "1099511627775", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED, + buf, sizeof(buf), 16, "281474976710655", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED, + buf, sizeof(buf), 18, "72057594037927935", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED, + buf, sizeof(buf), 21, "18446744073709551615", 0); + + /* some random values */ + + CALL_AND_TEST("\x52", 1, 0, + buf, sizeof(buf), 4, "-46", 0); + + CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED, + buf, sizeof(buf), 3, "14", 0); + + CALL_AND_TEST("\x62\xCE", 2, 0, + buf, sizeof(buf), 6, "-7474", 0); + + CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "10710", 0); + + CALL_AND_TEST("\x7F\xFF\x90", 3, 0, + buf, sizeof(buf), 5, "-112", 0); + + CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED, + buf, sizeof(buf), 6, "41238", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0, + buf, sizeof(buf), 3, "-9", 0); + + CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED, + buf, sizeof(buf), 3, "92", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0, + buf, sizeof(buf), 6, "-9117", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED, + buf, sizeof(buf), 6, "91234", 0); +#endif + + /* speed test */ + + speedo_t speedo; + ulint i; + + speedo_reset(&speedo); + + for (i = 0; i < 1000000; i++) { + row_raw_format_int("\x23", 1, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x23", 1, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + } + + speedo_show(&speedo); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.c new file mode 100644 index 00000000000..3ef9726588e --- /dev/null +++ b/storage/innobase/row/row0sel.c @@ -0,0 +1,4736 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/***************************************************//** +@file row/row0sel.c +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" + +#ifdef UNIV_NONINL +#include "row0sel.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "read0read.h" +#include "buf0lru.h" +#include "ha_prototypes.h" + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/********************************************************************//** +Returns TRUE if the user-defined column in a secondary index record +is alphabetically the same as the corresponding BLOB column in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! +@return TRUE if the columns are equal */ +static +ibool +row_sel_sec_rec_is_for_blob( +/*========================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of a + multi-byte character */ + ulint mbmaxlen, /*!< in: maximum length of a + multi-byte character */ + const byte* clust_field, /*!< in: the locally stored part of + the clustered index column, including + the BLOB pointer; the clustered + index record must be covered by + a lock or a page latch to protect it + against deletion (rollback or purge) */ + ulint clust_len, /*!< in: length of clust_field */ + const byte* sec_field, /*!< in: column in secondary index */ + ulint sec_len, /*!< in: length of sec_field */ + ulint zip_size) /*!< in: compressed page size, or 0 */ +{ + ulint len; + byte buf[DICT_MAX_INDEX_COL_LEN]; + + len = btr_copy_externally_stored_field_prefix(buf, sizeof buf, + zip_size, + clust_field, clust_len); + + if (UNIV_UNLIKELY(len == 0)) { + /* The BLOB was being deleted as the server crashed. + There should not be any secondary index records + referring to this clustered index record, because + btr_free_externally_stored_field() is called after all + secondary index entries of the row have been purged. 
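The prefix comparisons in this file repeatedly call dtype_get_at_most_n_mbchars() to cut a column value down to a prefix of n characters, which for a variable-length charset means scanning the bytes rather than multiplying by a fixed width. A rough stand-alone analogue for UTF-8 only; the real function dispatches on the column's charset, and utf8_prefix_bytes is a hypothetical name:

#include <stdio.h>
#include <string.h>

/* Return the length in bytes of at most n_chars leading characters of
the UTF-8 string data[0..len). */
static size_t
utf8_prefix_bytes(const char* data, size_t len, size_t n_chars)
{
	size_t	i = 0;

	while (i < len && n_chars > 0) {
		unsigned char	c = (unsigned char) data[i];

		/* the first byte of a sequence encodes its length */
		if (c < 0x80)		i += 1;
		else if (c < 0xE0)	i += 2;
		else if (c < 0xF0)	i += 3;
		else			i += 4;

		n_chars--;
	}

	return(i < len ? i : len);
}

int
main(void)
{
	/* 'a', then 2-byte 'ä', then 2-byte Cyrillic letters */
	const char*	s = "a\xC3\xA4\xD0\xBA\xD0\xB2\xD0\xB0";

	/* a 3-character prefix spans 1 + 2 + 2 = 5 bytes */
	printf("%zu\n", utf8_prefix_bytes(s, strlen(s), 3));	/* 5 */

	return(0);
}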
*/ + return(FALSE); + } + + len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen, + sec_len, len, (const char*) buf); + + return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len)); +} + +/********************************************************************//** +Returns TRUE if the user-defined column values in a secondary index record +are alphabetically the same as the corresponding columns in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! +@return TRUE if the secondary record is equal to the corresponding +fields in the clustered record, when compared with collation */ +static +ibool +row_sel_sec_rec_is_for_clust_rec( +/*=============================*/ + const rec_t* sec_rec, /*!< in: secondary index record */ + dict_index_t* sec_index, /*!< in: secondary index */ + const rec_t* clust_rec, /*!< in: clustered index record; + must be protected by a lock or + a page latch against deletion + in rollback or purge */ + dict_index_t* clust_index) /*!< in: clustered index */ +{ + const byte* sec_field; + ulint sec_len; + const byte* clust_field; + ulint n; + ulint i; + mem_heap_t* heap = NULL; + ulint clust_offsets_[REC_OFFS_NORMAL_SIZE]; + ulint sec_offsets_[REC_OFFS_SMALL_SIZE]; + ulint* clust_offs = clust_offsets_; + ulint* sec_offs = sec_offsets_; + ibool is_equal = TRUE; + + rec_offs_init(clust_offsets_); + rec_offs_init(sec_offsets_); + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(clust_index->table))) { + + /* The clustered index record is delete-marked; + it is not visible in the read view. Besides, + if there are any externally stored columns, + some of them may have already been purged. */ + return(FALSE); + } + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + ULINT_UNDEFINED, &heap); + + n = dict_index_get_n_ordering_defined_by_user(sec_index); + + for (i = 0; i < n; i++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint clust_pos; + ulint clust_len; + ulint len; + + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + clust_pos = dict_col_get_clust_pos(col, clust_index); + + clust_field = rec_get_nth_field( + clust_rec, clust_offs, clust_pos, &clust_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); + + len = clust_len; + + if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) { + + if (rec_offs_nth_extern(clust_offs, clust_pos)) { + len -= BTR_EXTERN_FIELD_REF_SIZE; + } + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ifield->prefix_len, len, (char*) clust_field); + + if (rec_offs_nth_extern(clust_offs, clust_pos) + && len < sec_len) { + if (!row_sel_sec_rec_is_for_blob( + col->mtype, col->prtype, + col->mbminlen, col->mbmaxlen, + clust_field, clust_len, + sec_field, sec_len, + dict_table_zip_size( + clust_index->table))) { + goto inequal; + } + + continue; + } + } + + if (0 != cmp_data_data(col->mtype, col->prtype, + clust_field, len, + sec_field, sec_len)) { +inequal: + is_equal = FALSE; + goto func_exit; + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(is_equal); +} + +/*********************************************************************//** +Creates a select node struct. 
+@return own: select node struct */ +UNIV_INTERN +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + sel_node_t* node; + + node = mem_heap_alloc(heap, sizeof(sel_node_t)); + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->plans = NULL; + + return(node); +} + +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +UNIV_INTERN +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /*!< in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/*********************************************************************//** +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /*!< in: first variable in a list of variables */ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + exp = node->select_list; + + while (var) { + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + var = que_node_get_next(var); + } +} + +/*********************************************************************//** +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /*!< in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + func_node = node->select_list; + + while (func_node) { + eval_node_set_int_val(func_node, 0); + + func_node = que_node_get_next(func_node); + } + + node->aggregate_already_fetched = FALSE; +} + +/*********************************************************************//** +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /*!< in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/*********************************************************************//** +Fetches the column values from a record. 
*/ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /*!< in: record index */ + const rec_t* rec, /*!< in: record in a clustered or non-clustered + index; must be protected by a page latch */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + mem_heap_t* heap = NULL; + ibool needs_copy; + + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, + field_no))) { + + /* Copy an externally stored field to the + temporary heap */ + + heap = mem_heap_create(1); + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + dict_table_zip_size(index->table), + field_no, &len, heap); + + ut_a(len != UNIV_SQL_NULL); + + needs_copy = TRUE; + } else { + data = rec_get_nth_field(rec, offsets, + field_no, &len); + + if (len == UNIV_SQL_NULL) { + len = UNIV_SQL_NULL; + } + + needs_copy = column->copy_val; + } + + if (needs_copy) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /*!< in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH + * sizeof(sel_buf_t)); + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + + sel_buf->val_buf_size = 0; + } +} + +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +UNIV_INTERN +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + mem_free(sel_buf->data); + } + } +} + +/*********************************************************************//** +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. 
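sel_col_prefetch_buf_alloc() above allocates the per-column prefetch array only when a column is first prefetched, and the matching free routine releases just the slots whose val_buf_size shows they ever owned memory. The same lazy pattern in isolation; slot_t and the function names are hypothetical:

#include <stdlib.h>

#define N_PREFETCH	16	/* stands in for SEL_MAX_N_PREFETCH */

/* Mirrors sel_buf_t: a value buffer plus the allocated capacity,
0 meaning "nothing owned here". */
typedef struct {
	void*	data;
	size_t	len;
	size_t	val_buf_size;
} slot_t;

/* Lazy allocation on first use, as in sel_col_prefetch_buf_alloc(). */
static slot_t*
prefetch_buf_create(void)
{
	slot_t*	buf = malloc(N_PREFETCH * sizeof(slot_t));
	size_t	i;

	for (i = 0; i < N_PREFETCH; i++) {
		buf[i].data = NULL;
		buf[i].len = 0;
		buf[i].val_buf_size = 0;
	}

	return(buf);
}

/* Only slots that ever owned memory are freed, matching
sel_col_prefetch_buf_free(). */
static void
prefetch_buf_free(slot_t* buf)
{
	size_t	i;

	for (i = 0; i < N_PREFETCH; i++) {
		if (buf[i].val_buf_size > 0) {
			free(buf[i].data);
		}
	}

	free(buf);
}

int
main(void)
{
	slot_t*	buf = prefetch_buf_create();

	prefetch_buf_free(buf);

	return(0);
}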
*/ +static +void +sel_pop_prefetched_row( +/*===================*/ + plan_t* plan) /*!< in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); + ut_d(dfield_set_null(val)); + + goto next_col; + } + + ut_ad(column->prefetch_buf); + ut_ad(!dfield_is_ext(val)); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = dfield_get_data(val); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/*********************************************************************//** +Pushes the column values for a prefetched, cached row to the column prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_push_prefetched_row( +/*====================*/ + plan_t* plan) /*!< in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! 
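The push/pop pair above never copies column bytes between the value node and the prefetch slot: it swaps the (data, len, buffer size) triples, so ownership of each heap block simply changes hands and every block can still be freed later. The trick in miniature, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
	char*	data;
	size_t	len;
	size_t	cap;	/* capacity owned; 0 = borrowed pointer */
} val_t;

/* Exchange ownership instead of copying, as the prefetch code does. */
static void
val_swap(val_t* a, val_t* b)
{
	val_t	tmp = *a;

	*a = *b;
	*b = tmp;
}

int
main(void)
{
	val_t	live = { strdup("row-1 column value"), 18, 19 };
	val_t	slot = { NULL, 0, 0 };

	val_swap(&slot, &live);	/* "push": the slot now owns row-1 */
	val_swap(&slot, &live);	/* "pop": the live value gets it back */

	printf("%s\n", live.data);

	if (live.cap) free(live.data);
	if (slot.cap) free(slot.data);

	return(0);
}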
*/ + + goto next_col; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = dfield_get_data(val); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static +ulint +row_sel_build_prev_vers( +/*====================*/ + read_view_t* read_view, /*!< in: read view */ + dict_index_t* index, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a clustered index */ + ulint** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, plan->index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t** old_vers_heap, /*!< out: old version heap to use */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint err; + + if (*old_vers_heap) { + mem_heap_empty(*old_vers_heap); + } else { + *old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, index, offsets, read_view, offset_heap, + *old_vers_heap, old_vers); + return(err); +} + +/*********************************************************************//** +Builds the last committed version of a clustered index record for a +semi-consistent read. +@return DB_SUCCESS or error code */ +static +ulint +row_sel_build_committed_vers_for_mysql( +/*===================================*/ + dict_index_t* clust_index, /*!< in: clustered index */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: record in a clustered index */ + ulint** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + const rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_semi_consistent_read( + rec, mtr, clust_index, offsets, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/*********************************************************************//** +Tests the conditions which determine when the index segment we are searching +through has been exhausted. 
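Both version-building helpers above use the same reuse-or-create step for their heaps: empty the heap if one already exists, create it only on first use, so a scan that rebuilds many old versions does not pay for repeated allocation. A sketch of the pattern with a toy bump arena in place of mem_heap_t (all names hypothetical):

#include <stdlib.h>

typedef struct {
	char*	base;
	size_t	used;
	size_t	cap;
} arena_t;

static arena_t*
arena_create(size_t cap)
{
	arena_t*	a = malloc(sizeof(arena_t));

	a->base = malloc(cap);
	a->used = 0;
	a->cap = cap;

	return(a);
}

/* The reuse-or-create step: emptying keeps the allocation warm across
repeated version builds; creation happens only on first use. */
static void
arena_reuse_or_create(arena_t** a, size_t cap)
{
	if (*a) {
		(*a)->used = 0;		/* mem_heap_empty() analogue */
	} else {
		*a = arena_create(cap);	/* mem_heap_create() analogue */
	}
}

int
main(void)
{
	arena_t*	a = NULL;
	int		i;

	for (i = 0; i < 3; i++) {
		arena_reuse_or_create(&a, 512);
		/* ... build one old version in the arena ... */
	}

	free(a->base);
	free(a);

	return(0);
}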
+@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(cond->args); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/*********************************************************************//** +Tests the other conditions. +@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. +@return DB_SUCCESS or error code */ +static +ulint +row_sel_get_clust_rec( +/*==================*/ + sel_node_t* node, /*!< in: select_node */ + plan_t* plan, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a non-clustered index */ + que_thr_t* thr, /*!< in: query thread */ + rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + *out_rec = NULL; + + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + offsets, ULINT_UNDEFINED, &heap); + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); + + index = dict_table_get_first_index(plan->table); + + btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE, + BTR_SEARCH_LEAF, &plan->clust_pcur, + 0, mtr); + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + ut_a(rec_get_deleted_flag(rec, + dict_table_is_comp(plan->table))); + ut_a(node->read_view); + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. 
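The split between the two helpers above matters: end_conds compare indexed columns, so a failed end condition proves that no later record in the ordered index can qualify and the scan may stop for good, while other_conds merely filter the current row. The asymmetry over a plain sorted array (illustrative only):

#include <stdio.h>

int
main(void)
{
	static const int	keys[] = { 2, 3, 5, 8, 13, 21 };
	int			i;

	for (i = 0; i < 6; i++) {
		if (keys[i] > 13) {	/* end condition: stop for good */
			break;
		}

		if (keys[i] % 2 == 0) {	/* other condition: skip row only */
			continue;
		}

		printf("%d\n", keys[i]);	/* prints 3, 5, 13 */
	}

	return(0);
}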
*/ + + goto func_exit; + } + + offsets = rec_get_offsets(clust_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation level + we lock only the record, i.e., next-key locking is + not used. */ + ulint lock_type; + trx_t* trx; + + trx = thr_get_trx(thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(&plan->clust_pcur), + clust_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers( + node->read_view, index, clust_rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + goto func_exit; + } + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if ((old_vers + || rec_get_deleted_flag(rec, dict_table_is_comp( + plan->table))) + && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, + clust_rec, index)) { + goto func_exit; + } + } + + /* Fetch the columns needed in test conditions. The clustered + index record is protected by a page latch that was acquired + when plan->clust_pcur was positioned. The latch will not be + released until mtr_commit(mtr). */ + + row_sel_fetch_columns(index, clust_rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; +func_exit: + err = DB_SUCCESS; +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/*********************************************************************//** +Sets a lock on a record. 
+@return DB_SUCCESS or error code */ +UNIV_INLINE +ulint +sel_set_rec_lock( +/*=============*/ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint mode, /*!< in: lock mode */ + ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + ulint err; + + trx = thr_get_trx(thr); + + if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) { + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + } + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, mode, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, mode, type, thr); + } + + return(err); +} + +/*********************************************************************//** +Opens a pcur to a table index. */ +static +void +row_sel_open_pcur( +/*==============*/ + plan_t* plan, /*!< in: table plan */ + ibool search_latch_locked, + /*!< in: TRUE if the thread currently + has the search latch locked in + s-mode */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + func_node_t* cond; + que_node_t* exp; + ulint n_fields; + ulint has_search_latch = 0; /* RW_S_LATCH or 0 */ + ulint i; + + if (search_latch_locked) { + has_search_latch = RW_S_LATCH; + } + + index = plan->index; + + /* Calculate the value of the search tuple: the exact match columns + get their expressions evaluated when we evaluate the right sides of + end_conds */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + eval_exp(que_node_get_next(cond->args)); + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + + if (plan->n_exact_match < n_fields) { + /* There is a non-exact match field which must be + evaluated separately */ + + eval_exp(plan->tuple_exps[n_fields - 1]); + } + + for (i = 0; i < n_fields; i++) { + exp = plan->tuple_exps[i]; + + dfield_copy_data(dtuple_get_nth_field(plan->tuple, i), + que_node_get_val(exp)); + } + + /* Open pcur to the index */ + + btr_pcur_open_with_no_init(index, plan->tuple, plan->mode, + BTR_SEARCH_LEAF, &plan->pcur, + has_search_latch, mtr); + } else { + /* Open the cursor to the start or the end of the index + (FALSE: no init) */ + + btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF, + &(plan->pcur), FALSE, mtr); + } + + ut_ad(plan->n_rows_prefetched == 0); + ut_ad(plan->n_rows_fetched == 0); + ut_ad(plan->cursor_at_end == FALSE); + + plan->pcur_is_open = TRUE; +} + +/*********************************************************************//** +Restores a stored pcur position to a table index.
+@return TRUE if the cursor should be moved to the next record after we +return from this function (moved to the previous, in the case of a +descending cursor) without processing again the current cursor +record */ +static +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF, + &(plan->pcur), mtr); + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. */ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(TRUE); +} + +/*********************************************************************//** +Resets a plan cursor to a closed state. 
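The case analysis in the comment above is easy to get wrong, so here is the ascending half restated as a tiny decision function; the enum and names are hypothetical stand-ins for the BTR_PCUR_* constants:

#include <stdio.h>

enum rel_pos { PCUR_BEFORE, PCUR_ON, PCUR_AFTER };

/* Given where btr_pcur_restore_position() left an ascending cursor,
decide whether the caller must advance before reading. */
static int
must_move_next_asc(enum rel_pos rel, int equal_position,
		   int stored_rec_processed)
{
	switch (rel) {
	case PCUR_ON:
		/* back on the same record: advance only if it was
		already processed; on a smaller record: always advance */
		return(equal_position ? stored_rec_processed : 1);
	case PCUR_AFTER:
		return(0);	/* already past: current rec is new */
	default:
		return(0);	/* BEFORE is not expected when asc */
	}
}

int
main(void)
{
	printf("%d\n", must_move_next_asc(PCUR_ON, 1, 1));	/* 1 */
	printf("%d\n", must_move_next_asc(PCUR_ON, 0, 0));	/* 1 */
	printf("%d\n", must_move_next_asc(PCUR_AFTER, 0, 0));	/* 0 */

	return(0);
}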
*/ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /*!< in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). +@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + sel_node_t* node, /*!< in: select node for a consistent read */ + plan_t* plan, /*!< in: plan for a unique search in clustered + index */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + rec_offs_init(offsets_); + + index = plan->index; + + ut_ad(node->read_view); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + row_sel_open_pcur(plan, TRUE, mtr); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (dict_index_is_clust(index)) { + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + ret = SEL_RETRY; + goto func_exit; + } + } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) { + + ret = SEL_RETRY; + goto func_exit; + } + + /* Test the deleted flag. */ + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + /* Fetch the columns needed in test conditions. The index + record is protected by a page latch that was acquired when + plan->pcur was positioned. The latch will not be released + until mtr_commit(mtr). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + plan->n_rows_fetched++; + ret = SEL_FOUND; +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ret); +} + +/*********************************************************************//** +Performs a select step. 
+@return DB_SUCCESS or error code */ +static +ulint +row_sel( +/*====*/ + sel_node_t* node, /*!< in: select node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + ibool search_latch_locked; + ibool consistent_read; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + ulint found_flag; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(thr->run_node == node); + + search_latch_locked = FALSE; + + if (node->read_view) { + /* In consistent reads, we try to do with the hash index and + not to use the buffer page get. This is to reduce memory bus + load resulting from semaphore operations. The search latch + will be s-locked when we access an index with a unique search + condition, but not locked when we access an index with a + less selective search condition. */ + + consistent_read = TRUE; + } else { + consistent_read = FALSE; + } + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr_start(&mtr); + + if (consistent_read && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust + && !plan->table->big_rows) { + if (!search_latch_locked) { + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; + } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often + kept for some 10 searches before being released, + a waiting x-latch request would block other threads + from acquiring an s-latch for a long time, lowering + performance significantly in multiprocessors. 
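A sketch of the "give a waiting x-latch a window" idiom described just above. POSIX rwlocks cannot be asked whether a writer is queued (InnoDB's rw_lock_get_writer() probe can), so this hypothetical reader simply drops and retakes the lock every few searches, which has the same fairness effect:

#include <pthread.h>

static pthread_rwlock_t	search_latch = PTHREAD_RWLOCK_INITIALIZER;

static void
do_searches(int n)
{
	int	i;

	pthread_rwlock_rdlock(&search_latch);

	for (i = 0; i < n; i++) {
		/* ... one hash-index probe under the s-latch ... */

		if (i % 10 == 9) {
			/* momentarily release so a writer can enter */
			pthread_rwlock_unlock(&search_latch);
			pthread_rwlock_rdlock(&search_latch);
		}
	}

	pthread_rwlock_unlock(&search_latch);
}

int
main(void)
{
	do_searches(100);

	return(0);
}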
*/ + + rw_lock_s_unlock(&btr_search_latch); + rw_lock_s_lock(&btr_search_latch); + } + + found_flag = row_sel_try_search_shortcut(node, plan, &mtr); + + if (found_flag == SEL_FOUND) { + + goto next_table; + + } else if (found_flag == SEL_EXHAUSTED) { + + goto table_exhausted; + } + + ut_ad(found_flag == SEL_RETRY); + + plan_reset_cursor(plan); + + mtr_commit(&mtr); + mtr_start(&mtr); + } + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + + search_latch_locked = FALSE; + } + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + + row_sel_open_pcur(plan, search_latch_locked, &mtr); + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && !page_rec_is_supremum(rec)) { + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!consistent_read) { + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation + level, we lock only the record, i.e., next-key + locking is not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + trx_t* trx; + + trx = thr_get_trx(thr); + + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) { + + if (page_rec_is_supremum(next_rec)) { + + goto skip_lock; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), + next_rec, index, offsets, + node->row_lock_mode, + lock_type, thr); + + if (err != DB_SUCCESS) { + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + + goto lock_wait_or_error; + } + } + } + +skip_lock: + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. */ + + cost_counter++; + + goto next_rec; + } + + if (!consistent_read) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation level, + we lock only the record, i.e., next-key locking is + not used. 
*/ + + ulint lock_type; + trx_t* trx; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + trx = thr_get_trx(thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) { + + if (page_rec_is_supremum(rec)) { + + goto next_rec; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), + rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (page_rec_is_supremum(rec)) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (consistent_read) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (dict_index_is_clust(index)) { + + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers( + node->read_view, index, rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Fetch the columns needed in + test conditions. The clustered + index record is protected by a + page latch that was acquired + by row_sel_open_pcur() or + row_sel_restore_pcur_pos(). + The latch will not be released + until mtr_commit(mtr). */ + + row_sel_fetch_columns( + index, rec, offsets, + UT_LIST_GET_FIRST( + plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, + node->read_view)) { + cons_read_requires_clust_rec = TRUE; + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions. The record is + protected by a page latch that was acquired by + row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch + will not be released until mtr_commit(mtr). 
*/ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table)) + && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(consistent_read); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(plan->table))) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch + || plan->table->big_rows) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_push_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_pop_prefetched_row(plan); + + goto next_table; + } + +next_rec: + ut_ad(!search_latch_locked); + + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
*/ + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&plan->pcur)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + err = DB_SUCCESS; + goto func_exit; + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + err = DB_SUCCESS; + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + } else { + node->state = SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + } + + goto func_exit; + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_gen(TRUE)); +#endif /* UNIV_SYNC_DEBUG */ + err = DB_SUCCESS; + goto func_exit; + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. 
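Stripped of locking and mtr bookkeeping, the table_loop / next_table / table_exhausted labels above implement an iterative nested-loop join: fetch_table advances on a match and backtracks on exhaustion, with the plan cursor reset when a table is re-entered. The same control shape over two in-memory arrays (illustrative only):

#include <stdio.h>

int
main(void)
{
	static const int	t0[] = { 1, 2 };
	static const int	t1[] = { 10, 20 };
	const int*	tabs[2] = { t0, t1 };
	int		n[2] = { 2, 2 };
	int		pos[2] = { 0, 0 };
	int		ft = 0;		/* fetch_table */

	while (ft >= 0) {
		if (pos[ft] == n[ft]) {		/* table_exhausted */
			pos[ft] = 0;		/* plan_reset_cursor */
			ft--;			/* backtrack */
			if (ft >= 0) pos[ft]++;
			continue;
		}

		if (ft + 1 == 2) {		/* innermost: emit row */
			printf("(%d,%d)\n", tabs[0][pos[0]],
			       tabs[1][pos[1]]);
			pos[ft]++;
		} else {
			ft++;			/* next_table */
		}
	}

	return(0);
}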
*/
+
+	plan->stored_cursor_rec_processed = TRUE;
+
+	ut_ad(!search_latch_locked);
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+	mtr_has_extra_clust_latch = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+	goto table_loop;
+
+lock_wait_or_error:
+	/* See the note at stop_for_a_while: the same holds for this case */
+
+	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+	ut_ad(!search_latch_locked);
+
+	plan->stored_cursor_rec_processed = FALSE;
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+func_exit:
+	if (search_latch_locked) {
+		rw_lock_s_unlock(&btr_search_latch);
+	}
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint		i_lock_mode;
+	sym_node_t*	table_node;
+	sel_node_t*	node;
+	ulint		err;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or when
+	execution resumes after a wait for a table intention lock), set
+	intention locks on the tables, or assign a read view */
+
+	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+		node->state = SEL_NODE_OPEN;
+	}
+
+	if (node->state == SEL_NODE_OPEN) {
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		trx_start_if_not_started(thr_get_trx(thr));
+
+		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+		if (node->consistent_read) {
+			/* Assign a read view for the query */
+			node->read_view = trx_assign_read_view(
+				thr_get_trx(thr));
+		} else {
+			if (node->set_x_locks) {
+				i_lock_mode = LOCK_IX;
+			} else {
+				i_lock_mode = LOCK_IS;
+			}
+
+			table_node = node->table_list;
+
+			while (table_node) {
+				err = lock_table(0, table_node->table,
+						 i_lock_mode, thr);
+				if (err != DB_SUCCESS) {
+					thr_get_trx(thr)->error_state = err;
+
+					return(NULL);
+				}
+
+				table_node = que_node_get_next(table_node);
+			}
+		}
+
+		/* If this is an explicit cursor, copy stored procedure
+		variable values, so that the values cannot change between
+		fetches (currently, we copy them also for non-explicit
+		cursors) */
+
+		if (node->explicit_cursor
+		    && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+			row_sel_copy_input_variable_vals(node);
+		}
+
+		node->state = SEL_NODE_FETCH;
+		node->fetch_table = 0;
+
+		if (node->is_aggregate) {
+			/* Reset the aggregate total values */
+			sel_reset_aggregate_vals(node);
+		}
+	}
+
+	err = row_sel(node, thr);
+
+	/* NOTE! if queries are parallelized, the following assignment may
+	have problems; the assignment should be made only if thr is the
+	only top-level thr in the graph: */
+
+	thr->graph->last_sel_node = node;
+
+	if (err != DB_SUCCESS) {
+		thr_get_trx(thr)->error_state = err;
+
+		return(NULL);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + if (node->into_list) { + sel_assign_into_var_values(node->into_list, + sel_node); + } else { + void* ret = (*node->func->func)( + sel_node, node->func->arg); + + if (!ret) { + sel_node->state + = SEL_NODE_NO_MORE_ROWS; + } + } + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + fprintf(stderr, + "InnoDB: Error: fetch called on a closed cursor\n"); + + thr_get_trx(thr)->error_state = DB_ERROR; + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/****************************************************************//** +Sample callback function for fetch that prints each row. +@return always returns non-NULL */ +UNIV_INTERN +void* +row_fetch_print( +/*============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: not used */ +{ + sel_node_t* node = row; + que_node_t* exp; + ulint i = 0; + + UT_NOT_USED(user_arg); + + fprintf(stderr, "row_fetch_print: row %p\n", row); + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + const dtype_t* type = dfield_get_type(dfield); + + fprintf(stderr, " column %lu:\n", (ulong)i); + + dtype_print(type); + putc('\n', stderr); + + if (dfield_get_len(dfield) != UNIV_SQL_NULL) { + ut_print_buf(stderr, dfield_get_data(dfield), + dfield_get_len(dfield)); + putc('\n', stderr); + } else { + fputs(" <NULL>;\n", stderr); + } + + exp = que_node_get_next(exp); + i++; + } + + return((void*)42); +} + +/****************************************************************//** +Callback function for fetch that stores an unsigned 4 byte integer to the +location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length += 4. +@return always returns NULL */ +UNIV_INTERN +void* +row_fetch_store_uint4( +/*==================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: data pointer */ +{ + sel_node_t* node = row; + ib_uint32_t* val = user_arg; + ulint tmp; + + dfield_t* dfield = que_node_get_val(node->select_list); + const dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(dtype_get_prtype(type) & DATA_UNSIGNED); + ut_a(len == 4); + + tmp = mach_read_from_4(dfield_get_data(dfield)); + *val = (ib_uint32_t) tmp; + + return(NULL); +} + +/***********************************************************//** +Prints a row in a select result. 
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = thr->run_node; + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + fputs(" ::: ", stderr); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ +UNIV_INTERN +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len, /*!< in: MySQL key value length */ + trx_t* trx) /*!< in: transaction */ +{ + byte* original_buf = buf; + const byte* original_key_ptr = key_ptr; + dict_field_t* field; + dfield_t* dfield; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; + const byte* key_end; + ulint n_fields = 0; + + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); + + if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) { + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. 
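*/
+
+/* The loop below walks the MySQL key buffer column by column. For
+illustration, a sketch of the per-column layout it decodes; the helper
+below is hypothetical and simplified (the real loop also advances over
+the fixed width reserved for prefix columns and handles true VARCHAR):
+
+	[ 1 byte  ]  SQL NULL indicator, present only for nullable columns
+	[ 2 bytes ]  actual data length, little-endian, present only for
+	             BLOB/TEXT prefixes and true VARCHAR columns
+	[ n bytes ]  value, padded to the fixed width reserved in the key
+
+	static const unsigned char*
+	read_key_column(
+		const unsigned char*	p,		// cursor in key buffer
+		int			nullable,
+		int			has_len_bytes,
+		unsigned		fixed_len,
+		unsigned*		data_len,	// out: value length
+		int*			is_null)	// out: SQL NULL?
+	{
+		*is_null = 0;
+		if (nullable) {
+			*is_null = (*p != 0);	// nonzero byte means NULL
+			p++;
+		}
+		if (has_len_bytes) {
+			// little-endian, as in the DATA_BLOB branch below
+			*data_len = p[0] + 256u * p[1];
+			p += 2;
+		} else {
+			*data_len = fixed_len;
+		}
+		return(p);	// p now points to the value bytes
+	}
+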
*/
+
+		ut_a(key_len == DATA_ROW_ID_LEN);
+
+		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+		dtuple_set_n_fields(tuple, 1);
+
+		return;
+	}
+
+	while (key_ptr < key_end) {
+
+		ulint	type = dfield_get_type(dfield)->mtype;
+		ut_a(field->col->mtype == type);
+
+		data_offset = 0;
+		is_null = FALSE;
+
+		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+			/* The first byte in the field tells if this is
+			an SQL NULL value */
+
+			data_offset = 1;
+
+			if (*key_ptr != 0) {
+				dfield_set_null(dfield);
+
+				is_null = TRUE;
+			}
+		}
+
+		/* Calculate data length and data field total length */
+
+		if (type == DATA_BLOB) {
+			/* The key field is a column prefix of a BLOB or
+			TEXT */
+
+			ut_a(field->prefix_len > 0);
+
+			/* MySQL stores the actual data length to the first 2
+			bytes after the optional SQL NULL marker byte. The
+			storage format is little-endian, that is, the most
+			significant byte at a higher address. In UTF-8, MySQL
+			seems to reserve field->prefix_len bytes for
+			storing this field in the key value buffer, even
+			though the actual value only takes data_len bytes
+			from the start. */
+
+			data_len = key_ptr[data_offset]
+				+ 256 * key_ptr[data_offset + 1];
+			data_field_len = data_offset + 2 + field->prefix_len;
+
+			data_offset += 2;
+
+			/* Now that we know the length, we store the column
+			value as if it were a fixed-length char field */
+
+		} else if (field->prefix_len > 0) {
+			/* Looks like MySQL pads unused end bytes in the
+			prefix with space. Therefore, also in UTF-8, it is ok
+			to compare with a prefix containing full prefix_len
+			bytes, and no need to take at most prefix_len / 3
+			UTF-8 characters from the start.
+			If the prefix is used as the upper end of a LIKE
+			'abc%' query, then MySQL pads the end with chars
+			0xff. TODO: in that case, would it do any harm to
+			compare with the full prefix_len bytes? How do
+			characters 0xff in UTF-8 behave? */
+
+			data_len = field->prefix_len;
+			data_field_len = data_offset + data_len;
+		} else {
+			data_len = dfield_get_type(dfield)->len;
+			data_field_len = data_offset + data_len;
+		}
+
+		if (UNIV_UNLIKELY
+		    (dtype_get_mysql_type(dfield_get_type(dfield))
+		     == DATA_MYSQL_TRUE_VARCHAR)
+		    && UNIV_LIKELY(type != DATA_INT)) {
+			/* In a MySQL key value format, a true VARCHAR is
+			always preceded by 2 bytes of a length field.
+			dfield_get_type(dfield)->len returns the maximum
+			'payload' len in bytes. That does not include the
+			2 bytes that tell the actual data length.
+
+			We added the check != DATA_INT to make sure we do
+			not treat MySQL ENUM or SET as a true VARCHAR! */
+
+			data_len += 2;
+			data_field_len += 2;
+		}
+
+		/* Storing may use at most data_len bytes of buf */
+
+		if (UNIV_LIKELY(!is_null)) {
+			row_mysql_store_col_in_innobase_format(
+				dfield, buf,
+				FALSE, /* MySQL key value format col */
+				key_ptr + data_offset, data_len,
+				dict_table_is_comp(index->table));
+			buf += data_len;
+		}
+
+		key_ptr += data_field_len;
+
+		if (UNIV_UNLIKELY(key_ptr > key_end)) {
+			/* The last field in key was not a complete key field
+			but a prefix of it.
+
+			Print a warning about this! HA_READ_PREFIX_LAST does
+			not currently work in InnoDB with partial-field key
+			value prefixes. Since MySQL currently uses a padding
+			trick to calculate LIKE 'abc%' type queries there
+			should never be partial-field prefixes in searches. */
+
+			ut_print_timestamp(stderr);
+
+			fputs(" InnoDB: Warning: using a partial-field"
+			      " key prefix in search.\n"
+			      "InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, index);
+			fprintf(stderr, ". 
Last data field length %lu bytes,\n" + "InnoDB: key ptr now exceeds" + " key end by %lu bytes.\n" + "InnoDB: Key value in the MySQL format:\n", + (ulong) data_field_len, + (ulong) (key_ptr - key_end)); + fflush(stderr); + ut_print_buf(stderr, original_key_ptr, key_len); + putc('\n', stderr); + + if (!is_null) { + ulint len = dfield_get_len(dfield); + dfield_set_len(dfield, len + - (ulint) (key_ptr - key_end)); + } + } + + n_fields++; + field++; + dfield++; + } + + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/**************************************************************//** +Stores the row id to the prebuilt struct. */ +static +void +row_sel_store_row_id_to_prebuilt( +/*=============================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */ + const rec_t* index_rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index of the record */ + const ulint* offsets) /*!< in: rec_get_offsets + (index_rec, index) */ +{ + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field( + index_rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); + + if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) { + fprintf(stderr, + "InnoDB: Error: Row id field is" + " wrong length %lu in ", (ulong) len); + dict_index_name_print(stderr, prebuilt->trx, index); + fprintf(stderr, "\n" + "InnoDB: Field number %lu, record:\n", + (ulong) dict_index_get_sys_col_pos(index, + DATA_ROW_ID)); + rec_print_new(stderr, index_rec, offsets); + putc('\n', stderr); + ut_error; + } + + ut_memcpy(prebuilt->row_id, data, len); +} + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */ +static +void +row_sel_field_store_in_mysql_format( +/*================================*/ + byte* dest, /*!< in/out: buffer where to store; NOTE + that BLOBs are not in themselves + stored here: the caller must allocate + and copy the BLOB into buffer before, + and pass the pointer to the BLOB in + 'data' */ + const mysql_row_templ_t* templ, + /*!< in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, + mbminlen, mbmaxlen */ + const byte* data, /*!< in: data to store */ + ulint len) /*!< in: length of the data */ +{ + byte* ptr; + byte* field_end; + byte* pad_ptr; + + ut_ad(len != UNIV_SQL_NULL); + + switch (templ->type) { + case DATA_INT: + /* Convert integer data from Innobase to a little-endian + format, sign bit restored to normal */ + + ptr = dest + len; + + for (;;) { + ptr--; + *ptr = *data; + if (ptr == dest) { + break; + } + data++; + } + + if (!templ->is_unsigned) { + dest[len - 1] = (byte) (dest[len - 1] ^ 128); + } + + ut_ad(templ->mysql_col_len == len); + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, templ->mysql_length_bytes); + } + + /* Copy the actual data */ + ut_memcpy(dest, data, len); + + /* Pad with trailing spaces. 
We pad with spaces also the + unused end of a >= 5.0.3 true VARCHAR column, just in case + MySQL expects its contents to be deterministic. */ + + pad_ptr = dest + len; + + ut_ad(templ->mbminlen <= templ->mbmaxlen); + + /* We handle UCS2 charset strings differently. */ + if (templ->mbminlen == 2) { + /* A space char is two bytes, 0x0020 in UCS2 */ + + if (len & 1) { + /* A 0x20 has been stripped from the column. + Pad it back. */ + + if (pad_ptr < field_end) { + *pad_ptr = 0x20; + pad_ptr++; + } + } + + /* Pad the rest of the string with 0x0020 */ + + while (pad_ptr < field_end) { + *pad_ptr = 0x00; + pad_ptr++; + *pad_ptr = 0x20; + pad_ptr++; + } + } else { + ut_ad(templ->mbminlen == 1); + /* space=0x20 */ + + memset(pad_ptr, 0x20, field_end - pad_ptr); + } + break; + + case DATA_BLOB: + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, templ->mysql_col_len, data, + len); + break; + + case DATA_MYSQL: + memcpy(dest, data, len); + + ut_ad(templ->mysql_col_len >= len); + ut_ad(templ->mbmaxlen >= templ->mbminlen); + + ut_ad(templ->mbmaxlen > templ->mbminlen + || templ->mysql_col_len == len); + /* The following assertion would fail for old tables + containing UTF-8 ENUM columns due to Bug #9526. */ + ut_ad(!templ->mbmaxlen + || !(templ->mysql_col_len % templ->mbmaxlen)); + ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len); + + if (templ->mbminlen != templ->mbmaxlen) { + /* Pad with spaces. This undoes the stripping + done in row0mysql.ic, function + row_mysql_store_col_in_innobase_format(). */ + + memset(dest + len, 0x20, templ->mysql_col_len - len); + } + break; + + default: +#ifdef UNIV_DEBUG + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Above are the valid column types for MySQL data. */ +#endif /* UNIV_DEBUG */ + ut_ad(templ->mysql_col_len == len); + memcpy(dest, data, len); + } +} + +/**************************************************************//** +Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. 
+@return TRUE if success, FALSE if could not allocate memory for a BLOB +(though we may also assert in that case) */ +static +ibool +row_sel_store_mysql_rec( +/*====================*/ + byte* mysql_rec, /*!< out: row in the MySQL format */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: Innobase record in the index + which was described in prebuilt's + template; must be protected by + a page latch */ + const ulint* offsets) /*!< in: array returned by + rec_get_offsets() */ +{ + mysql_row_templ_t* templ; + mem_heap_t* extern_field_heap = NULL; + mem_heap_t* heap; + const byte* data; + ulint len; + ulint i; + + ut_ad(prebuilt->mysql_template); + ut_ad(prebuilt->default_rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + } + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, + templ->rec_field_no))) { + + /* Copy an externally stored field to the temporary + heap */ + + ut_a(!prebuilt->trx->has_search_latch); + + if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) { + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + UNIV_PAGE_SIZE); + } + + heap = prebuilt->blob_heap; + } else { + extern_field_heap + = mem_heap_create(UNIV_PAGE_SIZE); + + heap = extern_field_heap; + } + + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + dict_table_zip_size(prebuilt->table), + templ->rec_field_no, &len, heap); + + ut_a(len != UNIV_SQL_NULL); + } else { + /* Field is stored in the row. */ + + data = rec_get_nth_field(rec, offsets, + templ->rec_field_no, &len); + + if (UNIV_UNLIKELY(templ->type == DATA_BLOB) + && len != UNIV_SQL_NULL) { + + /* It is a BLOB field locally stored in the + InnoDB record: we MUST copy its contents to + prebuilt->blob_heap here because later code + assumes all BLOB values have been copied to a + safe place. */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + UNIV_PAGE_SIZE); + } + + data = memcpy(mem_heap_alloc( + prebuilt->blob_heap, len), + data, len); + } + } + + if (len != UNIV_SQL_NULL) { + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, data, len); + + /* Cleanup */ + if (extern_field_heap) { + mem_heap_free(extern_field_heap); + extern_field_heap = NULL; + } + + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ + mysql_rec[templ->mysql_null_byte_offset] + &= ~(byte) templ->mysql_null_bit_mask; + } + } else { + /* MySQL assumes that the field for an SQL + NULL value is set to the default value. 
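*/
+
+/* For illustration, a sketch of the MySQL NULL-bitmap convention used
+in this function: each nullable column owns one bit in the row header;
+the bit is set for SQL NULL and cleared otherwise, and for NULL the
+value bytes are filled from the column default so that their contents
+stay deterministic. The helper below is hypothetical:
+
+	static void
+	set_mysql_null_bit(
+		unsigned char*	mysql_rec,
+		unsigned	null_byte_offset,
+		unsigned char	null_bit_mask,
+		int		is_null)
+	{
+		if (is_null) {
+			mysql_rec[null_byte_offset] |= null_bit_mask;
+		} else {
+			mysql_rec[null_byte_offset]
+				&= (unsigned char) ~null_bit_mask;
+		}
+	}
+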
*/ + + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; + memcpy(mysql_rec + templ->mysql_col_offset, + (const byte*) prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + } + } + + return(TRUE); +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static +ulint +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + read_view_t* read_view, /*!< in: read view */ + dict_index_t* clust_index, /*!< in: clustered index */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: record in a clustered index */ + ulint** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, clust_index, offsets, read_view, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. +@return DB_SUCCESS or error code */ +static +ulint +row_sel_get_clust_rec_for_mysql( +/*============================*/ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */ + dict_index_t* sec_index,/*!< in: secondary index where rec resides */ + const rec_t* rec, /*!< in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ + que_thr_t* thr, /*!< in: query thread */ + const rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + ulint** offsets,/*!< in: offsets returned by + rec_get_offsets(rec, sec_index); + out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + const rec_t* clust_rec; + rec_t* old_vers; + ulint err; + trx_t* trx; + + *out_rec = NULL; + trx = thr_get_trx(thr); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, + sec_index, *offsets, trx); + + clust_index = dict_table_get_first_index(sec_index->table); + + btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + prebuilt->clust_pcur->trx_if_known = trx; + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + + /* In a rare case it is possible that no clust rec is 
found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + || prebuilt->select_lock_type != LOCK_NONE) { + ut_print_timestamp(stderr); + fputs(" InnoDB: error clustered record" + " for sec rec not found\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, sec_index); + fputs("\n" + "InnoDB: sec index record ", stderr); + rec_print(stderr, rec, sec_index); + fputs("\n" + "InnoDB: clust index record ", stderr); + rec_print(stderr, clust_rec, clust_index); + putc('\n', stderr); + trx_print(stderr, trx, 600); + + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } + + clust_rec = NULL; + + goto func_exit; + } + + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + ULINT_UNDEFINED, offset_heap); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(prebuilt->clust_pcur), + clust_rec, clust_index, *offsets, + prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr); + if (err != DB_SUCCESS) { + + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + /* If the isolation level allows reading of uncommitted data, + then we never look for an earlier version */ + + if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && !lock_clust_rec_cons_read_sees( + clust_rec, clust_index, *offsets, + trx->read_view)) { + + /* The following call returns 'offsets' associated with + 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, prebuilt, + clust_rec, offsets, offset_heap, &old_vers, + mtr); + + if (err != DB_SUCCESS || old_vers == NULL) { + + goto err_exit; + } + + clust_rec = old_vers; + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. 
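*/
+
+/* For illustration, the visibility decision explained above reduced to
+a stand-alone predicate (names hypothetical, not part of InnoDB):
+
+	static int
+	sec_rec_belongs_to_snapshot(
+		int	used_old_version,	// old_vers != NULL
+		int	sec_rec_del_marked,
+		int	sec_matches_clust)	// sec rec agrees with the
+						// clust rec version we see
+	{
+		if (used_old_version || sec_rec_del_marked) {
+			// the sec rec may belong to a version newer than
+			// our snapshot: require agreement with clust rec
+			return(sec_matches_clust);
+		}
+		return(1);
+	}
+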
*/
+
+		if (clust_rec
+		    && (old_vers
+			|| rec_get_deleted_flag(rec, dict_table_is_comp(
+							sec_index->table)))
+		    && !row_sel_sec_rec_is_for_clust_rec(
+			    rec, sec_index, clust_rec, clust_index)) {
+			clust_rec = NULL;
+#ifdef UNIV_SEARCH_DEBUG
+		} else {
+			ut_a(clust_rec == NULL
+			     || row_sel_sec_rec_is_for_clust_rec(
+				     rec, sec_index, clust_rec, clust_index));
+#endif
+		}
+	}
+
+func_exit:
+	*out_rec = clust_rec;
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* We may use the cursor in update or in unlock_row():
+		store its position */
+
+		btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+	}
+
+	err = DB_SUCCESS;
+err_exit:
+	return(err);
+}
+
+/********************************************************************//**
+Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been
+deleted. Then we may have to move the cursor one step up or down.
+@return TRUE if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+	ibool*		same_user_rec,	/*!< out: TRUE if we were able to restore
+					the cursor on a user record with the
+					same ordering prefix in the
+					B-tree index */
+	ulint		latch_mode,	/*!< in: latch mode wished in
+					restoration */
+	btr_pcur_t*	pcur,		/*!< in: cursor whose position
+					has been stored */
+	ibool		moves_up,	/*!< in: TRUE if the cursor moves up
+					in the index */
+	mtr_t*		mtr)		/*!< in: mtr; CAUTION: may commit
+					mtr temporarily! */
+{
+	ibool	success;
+	ulint	relative_position;
+
+	relative_position = pcur->rel_pos;
+
+	success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+	*same_user_rec = success;
+
+	if (relative_position == BTR_PCUR_ON) {
+		if (success) {
+			return(FALSE);
+		}
+
+		if (moves_up) {
+			btr_pcur_move_to_next(pcur, mtr);
+		}
+
+		return(TRUE);
+	}
+
+	if (relative_position == BTR_PCUR_AFTER
+	    || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
+
+		if (moves_up) {
+			return(TRUE);
+		}
+
+		if (btr_pcur_is_on_user_rec(pcur)) {
+			btr_pcur_move_to_prev(pcur, mtr);
+		}
+
+		return(TRUE);
+	}
+
+	ut_ad(relative_position == BTR_PCUR_BEFORE
+	      || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
+
+	if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+		btr_pcur_move_to_next(pcur, mtr);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Pops a cached row for MySQL from the fetch cache.
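+The cache is a simple FIFO which is drained completely before it is
+refilled. For illustration, the discipline implemented by this function
+and by row_sel_push_cache_row_for_mysql() below:
+
+	push: the row is built directly into
+	      fetch_cache[n_fetch_cached], then n_fetch_cached++;
+	      pushing is only done while fetch_cache_first == 0
+	pop:  fetch_cache[fetch_cache_first] is copied to the caller,
+	      then fetch_cache_first++ and n_fetch_cached--;
+	      when n_fetch_cached reaches 0, fetch_cache_first resets to 0
+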
*/ +UNIV_INLINE +void +row_sel_pop_cached_row_for_mysql( +/*=============================*/ + byte* buf, /*!< in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */ +{ + ulint i; + mysql_row_templ_t* templ; + byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) { + /* Copy cache record field by field, don't touch fields that + are not covered by current key */ + cached_rec = prebuilt->fetch_cache[ + prebuilt->fetch_cache_first]; + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + ut_memcpy(buf + templ->mysql_col_offset, + cached_rec + templ->mysql_col_offset, + templ->mysql_col_len); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) { + buf[templ->mysql_null_byte_offset] + ^= (buf[templ->mysql_null_byte_offset] + ^ cached_rec[templ->mysql_null_byte_offset]) + & (byte)templ->mysql_null_bit_mask; + } + } + } + else { + ut_memcpy(buf, + prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_prefix_len); + } + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/********************************************************************//** +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_push_cache_row_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: record to push; must + be protected by a page latch */ + const ulint* offsets) /*!< in: rec_get_offsets() */ +{ + byte* buf; + ulint i; + + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_a(!prebuilt->templ_contains_blob); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + + /* A user has reported memory corruption in these + buffers in Linux. Put magic numbers there to help + to track a possible bug. */ + + buf = mem_alloc(prebuilt->mysql_row_len + 8); + + prebuilt->fetch_cache[i] = buf + 4; + + mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N); + mach_write_to_4(buf + 4 + prebuilt->mysql_row_len, + ROW_PREBUILT_FETCH_MAGIC_N); + } + } + + ut_ad(prebuilt->fetch_cache_first == 0); + + if (UNIV_UNLIKELY(!row_sel_store_mysql_rec( + prebuilt->fetch_cache[ + prebuilt->n_fetch_cached], + prebuilt, rec, offsets))) { + ut_error; + } + + prebuilt->n_fetch_cached++; +} + +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). We assume that the search +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, +btr search latch has been locked in S-mode. 
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut_for_mysql( +/*==================================*/ + const rec_t** out_rec,/*!< out: record if found */ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */ + ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + mtr_t* mtr) /*!< in: started mtr */ +{ + dict_index_t* index = prebuilt->index; + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + const rec_t* rec; + + ut_ad(dict_index_is_clust(index)); + ut_ad(!prebuilt->templ_contains_blob); + + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, +#ifndef UNIV_SEARCH_DEBUG + RW_S_LATCH, +#else + 0, +#endif + mtr); + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { + + return(SEL_RETRY); + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) { + + return(SEL_EXHAUSTED); + } + + *out_rec = rec; + + return(SEL_FOUND); +} + +/********************************************************************//** +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ +UNIV_INTERN +ulint +row_search_for_mysql( +/*=================*/ + byte* buf, /*!< in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /*!< in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /*!< in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. 
*/ +{ + dict_index_t* index = prebuilt->index; + ibool comp = dict_table_is_comp(index->table); + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + const rec_t* rec; + const rec_t* result_rec; + const rec_t* clust_rec; + ulint err = DB_SUCCESS; + ibool unique_search = FALSE; + ibool unique_search_from_clust_index = FALSE; + ibool mtr_has_extra_clust_latch = FALSE; + ibool moves_up = FALSE; + ibool set_also_gap_locks = TRUE; + /* if the query is a plain locking SELECT, and the isolation level + is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ + ibool did_semi_consistent_read = FALSE; + /* if the returned record was locked and we did a semi-consistent + read (fetch the newest committed version), then this is set to + TRUE */ +#ifdef UNIV_SEARCH_DEBUG + ulint cnt = 0; +#endif /* UNIV_SEARCH_DEBUG */ + ulint next_offs; + ibool same_user_rec; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + ut_ad(index && pcur && search_tuple); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you used" + " DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + + return(DB_MISSING_HISTORY); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + +#if 0 + /* August 19, 2005 by Heikki: temporarily disable this error + print until the cursor lock count is done correctly. + See bugs #12263 and #12456!*/ + + if (trx->n_mysql_tables_in_use == 0 + && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) { + /* Note that if MySQL uses an InnoDB temp table that it + created inside LOCK TABLES, then n_mysql_tables_in_use can + be zero; in that case select_lock_type is set to LOCK_X in + ::start_stmt. 
*/ + + fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n" + "InnoDB: but it has not locked" + " any tables in ::external_lock()!\n", + stderr); + trx_print(stderr, trx, 600); + fputc('\n', stderr); + } +#endif + +#if 0 + fprintf(stderr, "Match mode %lu\n search tuple ", + (ulong) match_mode); + dtuple_print(search_tuple); + fprintf(stderr, "N tables locked %lu\n", + (ulong) trx->mysql_n_tables_locked); +#endif + /*-------------------------------------------------------------*/ + /* PHASE 0: Release a possible s-latch we are holding on the + adaptive hash index latch if there is someone waiting behind */ + + if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED) + && trx->has_search_latch) { + + /* There is an x-latch request on the adaptive hash index: + release the s-latch to reduce starvation and wait for + BTR_SEA_TIMEOUT rounds before trying to keep it again over + calls from MySQL */ + + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + } + + /* Reset the new record lock info if srv_locks_unsafe_for_binlog + is set or session is using a READ COMMITED isolation level. Then + we are able to remove the record locks set here on an individual + row. */ + prebuilt->new_rec_locks = 0; + + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + + if (UNIV_UNLIKELY(direction == 0)) { + trx->op_info = "starting index read"; + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + trx->op_info = "fetching rows"; + + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) { + if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) { + ut_error; + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! */ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + + srv_n_rows_read++; + err = DB_SUCCESS; + goto func_exit; + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { + + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a + non-delete-marked matching record. + + Note that in a unique secondary index there may be different + delete-marked versions of a record where only the primary key + values differ: thus in a secondary index we must use next-key + locks when locking delete-marked records. 
*/ + + if (match_mode == ROW_SEL_EXACT + && dict_index_is_unique(index) + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index) + && (dict_index_is_clust(index) + || !dtuple_contains_null(search_tuple))) { + + /* Note above that a UNIQUE secondary index can contain many + rows with the same key value if one of the columns is the SQL + null. A clustered index under MySQL can never contain null + columns because we demand that all the columns in primary key + are non-null. */ + + unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. */ + + if (UNIV_UNLIKELY(direction != 0 + && !prebuilt->used_in_HANDLER)) { + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + } + + mtr_start(&mtr); + + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (UNIV_UNLIKELY(direction == 0) + && unique_search + && dict_index_is_clust(index) + && !prebuilt->templ_contains_blob + && !prebuilt->used_in_HANDLER + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { + + mode = PAGE_CUR_GE; + + unique_search_from_clust_index = TRUE; + + if (trx->mysql_n_tables_locked == 0 + && prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view) { + + /* This is a SELECT query done as a consistent read, + and the read view has already been allocated: + let us try a search shortcut through the hash + index. + NOTE that we must also test that + mysql_n_tables_locked == 0, because this might + also be INSERT INTO ... SELECT ... or + CREATE TABLE ... SELECT ... . Our algorithm is + NOT prepared to inserts interleaved with the SELECT, + and if we try that, we can deadlock on the adaptive + hash index semaphore! */ + +#ifndef UNIV_SEARCH_DEBUG + if (!trx->has_search_latch) { + rw_lock_s_lock(&btr_search_latch); + trx->has_search_latch = TRUE; + } +#endif + switch (row_sel_try_search_shortcut_for_mysql( + &rec, prebuilt, &offsets, &heap, + &mtr)) { + case SEL_FOUND: +#ifdef UNIV_SEARCH_DEBUG + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); +#endif + /* At this point, rec is protected by + a page latch that was acquired by + row_sel_try_search_shortcut_for_mysql(). + The latch will not be released until + mtr_commit(&mtr). 
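*/
+
+/* NOTE on ordering: the record must be converted to the MySQL row
+format while the page latch is still held, i.e. before mtr_commit().
+For illustration, the protocol in outline (step names hypothetical):
+
+	// 1. the page is latched inside the mini-transaction
+	// 2. row_sel_store_mysql_rec(): copy rec to a private buffer
+	// 3. mtr_commit(): the page latch is released
+	// 4. return the private buffer; rec must not be touched again
+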
*/
+
+				if (!row_sel_store_mysql_rec(buf, prebuilt,
+							     rec, offsets)) {
+					err = DB_TOO_BIG_RECORD;
+
+					/* We let the main loop do the
+					error handling */
+					goto shortcut_fails_too_big_rec;
+				}
+
+				mtr_commit(&mtr);
+
+				/* ut_print_name(stderr, index->name);
+				fputs(" shortcut\n", stderr); */
+
+				srv_n_rows_read++;
+
+				err = DB_SUCCESS;
+				goto release_search_latch_if_needed;
+
+			case SEL_EXHAUSTED:
+				mtr_commit(&mtr);
+
+				/* ut_print_name(stderr, index->name);
+				fputs(" record not found 2\n", stderr); */
+
+				err = DB_RECORD_NOT_FOUND;
+release_search_latch_if_needed:
+				if (trx->search_latch_timeout > 0
+				    && trx->has_search_latch) {
+
+					trx->search_latch_timeout--;
+
+					rw_lock_s_unlock(&btr_search_latch);
+					trx->has_search_latch = FALSE;
+				}
+
+				/* NOTE that we do NOT store the cursor
+				position */
+				goto func_exit;
+
+			case SEL_RETRY:
+				break;
+
+			default:
+				ut_ad(0);
+			}
+shortcut_fails_too_big_rec:
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+		}
+	}
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 3: Open or restore index cursor position */
+
+	if (trx->has_search_latch) {
+		rw_lock_s_unlock(&btr_search_latch);
+		trx->has_search_latch = FALSE;
+	}
+
+	trx_start_if_not_started(trx);
+
+	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+	    && prebuilt->select_lock_type != LOCK_NONE
+	    && trx->mysql_thd != NULL
+	    && thd_is_select(trx->mysql_thd)) {
+		/* It is a plain locking SELECT and the isolation
+		level is low: do not lock gaps */
+
+		set_also_gap_locks = FALSE;
+	}
+
+	/* Note that if the search mode was GE or G, then the cursor
+	naturally moves upward (in fetch next) in alphabetical order,
+	otherwise downward */
+
+	if (UNIV_UNLIKELY(direction == 0)) {
+		if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+			moves_up = TRUE;
+		}
+	} else if (direction == ROW_SEL_NEXT) {
+		moves_up = TRUE;
+	}
+
+	thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	if (UNIV_LIKELY(direction != 0)) {
+		ibool	need_to_process = sel_restore_position_for_mysql(
+			&same_user_rec, BTR_SEARCH_LEAF,
+			pcur, moves_up, &mtr);
+
+		if (UNIV_UNLIKELY(need_to_process)) {
+			if (UNIV_UNLIKELY(prebuilt->row_read_type
+					  == ROW_READ_DID_SEMI_CONSISTENT)) {
+				/* We did a semi-consistent read,
+				but the record was removed in
+				the meantime. */
+				prebuilt->row_read_type
+					= ROW_READ_TRY_SEMI_CONSISTENT;
+			}
+		} else if (UNIV_LIKELY(prebuilt->row_read_type
+				       != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+			/* The cursor was positioned on the record
+			that we returned previously. If we need
+			to repeat a semi-consistent read as a
+			pessimistic locking read, the record
+			cannot be skipped. */
+
+			goto next_rec;
+		}
+
+	} else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+		btr_pcur_open_with_no_init(index, search_tuple, mode,
+					   BTR_SEARCH_LEAF,
+					   pcur, 0, &mtr);
+
+		pcur->trx_if_known = trx;
+
+		rec = btr_pcur_get_rec(pcur);
+
+		if (!moves_up
+		    && !page_rec_is_supremum(rec)
+		    && set_also_gap_locks
+		    && !(srv_locks_unsafe_for_binlog
+			 || trx->isolation_level == TRX_ISO_READ_COMMITTED)
+		    && prebuilt->select_lock_type != LOCK_NONE) {
+
+			/* Try to place a gap lock on the next index record
+			to prevent phantoms in ORDER BY ... 
DESC queries */ + const rec_t* next = page_rec_get_next_const(rec); + + offsets = rec_get_offsets(next, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + next, index, offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + } else { + if (mode == PAGE_CUR_G) { + btr_pcur_open_at_index_side( + TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE, + &mtr); + } else if (mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side( + FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE, + &mtr); + } + } + + if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + + if (trx->read_view == NULL + && prebuilt->select_lock_type == LOCK_NONE) { + + fputs("InnoDB: Error: MySQL is trying to" + " perform a consistent read\n" + "InnoDB: but the read view is not assigned!\n", + stderr); + trx_print(stderr, trx, 600); + fputc('\n', stderr); + ut_a(0); + } + } else if (prebuilt->select_lock_type == LOCK_NONE) { + /* This is a consistent read */ + /* Assign a read view for the query */ + + trx_assign_read_view(trx); + prebuilt->sql_stat_start = FALSE; + } else { + ulint lock_mode; + if (prebuilt->select_lock_type == LOCK_S) { + lock_mode = LOCK_IS; + } else { + lock_mode = LOCK_IX; + } + err = lock_table(0, index->table, lock_mode, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + prebuilt->sql_stat_start = FALSE; + } + +rec_loop: + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + + rec = btr_pcur_get_rec(pcur); + ut_ad(!!page_rec_is_comp(rec) == comp); +#ifdef UNIV_SEARCH_DEBUG + /* + fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt, + page_get_page_no(page_align(rec))); + rec_print(rec); + */ +#endif /* UNIV_SEARCH_DEBUG */ + + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (page_rec_is_supremum(rec)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using a READ COMMITTED isolation + level we do not lock gaps. Supremum record is really + a gap and therefore we do not set locks there. 
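*/
+
+/* For illustration, the supremum locking decision as a stand-alone
+predicate (names hypothetical; it mirrors the test made below):
+
+	static int
+	should_lock_supremum(
+		int	set_also_gap_locks,
+		int	locks_unsafe_for_binlog,	// server option
+		int	read_committed,			// isolation level
+		int	locking_select)			// not LOCK_NONE
+	{
+		// the supremum is really a gap: lock it only when
+		// next-key locking is in effect
+		return(set_also_gap_locks
+		       && !(locks_unsafe_for_binlog || read_committed)
+		       && locking_select);
+	}
+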
*/ + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + /* A page supremum record cannot be in the result set: skip + it now that we have placed a possible lock on it */ + + goto next_rec; + } + + /*-------------------------------------------------------------*/ + /* Do sanity checks in case our cursor has bumped into page + corruption */ + + if (comp) { + next_offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { + + goto wrong_offs; + } + } else { + next_offs = rec_get_next_offs(rec, FALSE); + if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { + + goto wrong_offs; + } + } + + if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) { + +wrong_offs: + if (srv_force_recovery == 0 || moves_up == FALSE) { + ut_print_timestamp(stderr); + buf_page_print(page_align(rec), 0); + fprintf(stderr, + "\nInnoDB: rec address %p," + " buf block fix count %lu\n", + (void*) rec, (ulong) + btr_cur_get_block(btr_pcur_get_btr_cur(pcur)) + ->page.buf_fix_count); + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". Run CHECK TABLE. You may need to\n" + "InnoDB: restore from a backup, or" + " dump + drop + reimport the table.\n", + stderr); + + err = DB_CORRUPTION; + + goto lock_wait_or_error; + } else { + /* The user may be dumping a corrupt table. Jump + over the corruption to recover as much as possible. */ + + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the rest of the page.\n", + stderr); + + btr_pcur_move_to_last_on_page(pcur, &mtr); + + goto next_rec; + } + } + /*-------------------------------------------------------------*/ + + /* Calculate the 'offsets' associated with 'rec' */ + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(srv_force_recovery > 0)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the record.\n", + stderr); + + goto next_rec; + } + } + + /* Note that we cannot trust the up_match value in the cursor at this + place because we can arrive here after moving the cursor! Thus + we have to recompare rec and search_tuple to determine if they + match enough. 
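*/
+
+/* For illustration, the match_mode test performed below as a
+stand-alone predicate; cmp_result and is_prefix stand for the outcomes
+of cmp_dtuple_rec() and cmp_dtuple_is_prefix_of_rec():
+
+	static int
+	rec_matches_search_tuple(
+		int	exact_mode,	// ROW_SEL_EXACT
+		int	prefix_mode,	// ROW_SEL_EXACT_PREFIX
+		int	cmp_result,	// 0 when rec equals the tuple
+		int	is_prefix)	// tuple is a prefix of rec
+	{
+		if (exact_mode) {
+			return(cmp_result == 0);
+		}
+		if (prefix_mode) {
+			return(is_prefix);
+		}
+		return(1);	// range scan: any user record qualifies
+	}
+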
*/
+
+	if (match_mode == ROW_SEL_EXACT) {
+		/* Test if the index record matches the search_tuple in
+		prebuilt completely: if not, then we return with
+		DB_RECORD_NOT_FOUND */
+
+		/* fputs("Comparing rec and search tuple\n", stderr); */
+
+		if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+			if (set_also_gap_locks
+			    && !(srv_locks_unsafe_for_binlog
+				 || trx->isolation_level
+				 == TRX_ISO_READ_COMMITTED)
+			    && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* Try to place a gap lock on the index
+				record only if innodb_locks_unsafe_for_binlog
+				option is not set or this session is not
+				using a READ COMMITTED isolation level. */
+
+				err = sel_set_rec_lock(
+					btr_pcur_get_block(pcur),
+					rec, index, offsets,
+					prebuilt->select_lock_type, LOCK_GAP,
+					thr);
+
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+			}
+
+			btr_pcur_store_position(pcur, &mtr);
+
+			err = DB_RECORD_NOT_FOUND;
+			/* ut_print_name(stderr, index->name);
+			fputs(" record not found 3\n", stderr); */
+
+			goto normal_return;
+		}
+
+	} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+		if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+			if (set_also_gap_locks
+			    && !(srv_locks_unsafe_for_binlog
+				 || trx->isolation_level
+				 == TRX_ISO_READ_COMMITTED)
+			    && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* Try to place a gap lock on the index
+				record only if innodb_locks_unsafe_for_binlog
+				option is not set or this session is not
+				using a READ COMMITTED isolation level. */
+
+				err = sel_set_rec_lock(
+					btr_pcur_get_block(pcur),
+					rec, index, offsets,
+					prebuilt->select_lock_type, LOCK_GAP,
+					thr);
+
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+			}
+
+			btr_pcur_store_position(pcur, &mtr);
+
+			err = DB_RECORD_NOT_FOUND;
+			/* ut_print_name(stderr, index->name);
+			fputs(" record not found 4\n", stderr); */
+
+			goto normal_return;
+		}
+	}
+
+	/* We are ready to look at a possible new index entry in the result
+	set: the cursor is now placed on a user record */
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* Try to place a lock on the index record; note that delete
+		marked records are a special case in a unique search. If there
+		is a non-delete marked record, then it is enough to lock its
+		existence with LOCK_REC_NOT_GAP. */
+
+		/* If the innodb_locks_unsafe_for_binlog option is used
+		or this session is using a READ COMMITTED isolation
+		level, we lock only the record, i.e., next-key locking is
+		not used. */
+
+		ulint	lock_type;
+
+		if (!set_also_gap_locks
+		    || srv_locks_unsafe_for_binlog
+		    || trx->isolation_level == TRX_ISO_READ_COMMITTED
+		    || (unique_search
+			&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
+
+			goto no_gap_lock;
+		} else {
+			lock_type = LOCK_ORDINARY;
+		}
+
+		/* If we are doing a 'greater or equal than a primary key
+		value' search from a clustered index, and we find a record
+		that has that exact primary key value, then there is no need
+		to lock the gap before the record, because no insert in the
+		gap can be in our search range. That is, no phantom row can
+		appear that way.
+
+		An example: if col1 is the primary key, the search is WHERE
+		col1 >= 100, and we find a record where col1 = 100, then no
+		need to lock the gap before that record.
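*/
+
+/* For illustration, that gap-lock exemption as a stand-alone predicate
+(names hypothetical; it mirrors the test made right below):
+
+	static int
+	can_skip_gap_lock_before_rec(
+		int		is_clust_index,
+		int		mode_is_ge,	// PAGE_CUR_GE
+		int		opening_cursor,	// direction == 0
+		unsigned	n_cmp_fields,	// fields compared in search
+		unsigned	n_unique,	// unique fields in index
+		int		rec_equals_tuple)
+	{
+		return(is_clust_index && mode_is_ge && opening_cursor
+		       && n_cmp_fields == n_unique && rec_equals_tuple);
+	}
+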
*/ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) { +no_gap_lock: + lock_type = LOCK_REC_NOT_GAP; + } + + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + + switch (err) { + const rec_t* old_vers; + case DB_SUCCESS: + if (srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) { + /* Note that a record of + prebuilt->index was locked. */ + prebuilt->new_rec_locks = 1; + } + break; + case DB_LOCK_WAIT: + if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_TRY_SEMI_CONSISTENT) + || index != clust_index) { + + goto lock_wait_or_error; + } + + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_committed_vers_for_mysql( + clust_index, prebuilt, rec, + &offsets, &heap, &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + mutex_enter(&kernel_mutex); + if (trx->was_chosen_as_deadlock_victim) { + mutex_exit(&kernel_mutex); + err = DB_DEADLOCK; + + goto lock_wait_or_error; + } + if (UNIV_LIKELY(trx->wait_lock != NULL)) { + lock_cancel_waiting_and_release( + trx->wait_lock); + prebuilt->new_rec_locks = 0; + } else { + mutex_exit(&kernel_mutex); + + /* The lock was granted while we were + searching for the last committed version. + Do a normal locking read. */ + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, + &heap); + err = DB_SUCCESS; + /* Note that a record of + prebuilt->index was locked. */ + prebuilt->new_rec_locks = 1; + break; + } + mutex_exit(&kernel_mutex); + + if (old_vers == NULL) { + /* The row was not yet committed */ + + goto next_rec; + } + + did_semi_consistent_read = TRUE; + rec = old_vers; + break; + default: + + goto lock_wait_or_error; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { + + /* Fetch a previous version of the row if the current + one is not visible in the snapshot; if we have a very + high force recovery level set, we try to avoid crashes + by skipping this lookup */ + + if (UNIV_LIKELY(srv_force_recovery < 5) + && !lock_clust_rec_cons_read_sees( + rec, index, offsets, trx->read_view)) { + + rec_t* old_vers; + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, rec, &offsets, &heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, trx->read_view)) { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. */ + + ut_ad(index != clust_index); + + goto requires_clust_rec; + } + } + + /* NOTE that at this point rec can be an old version of a clustered + index record built for a consistent read. We cannot assume after this + point that rec is on a buffer pool page. 
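
The consistent-read branch above hinges on whether the read view "sees" the transaction that wrote the record version. A toy model of such a view, with illustrative field names (the real structure and its dulint id arithmetic live elsewhere in InnoDB):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_read_view {
	unsigned long		up_limit;	/* ids below this had committed */
	unsigned long		low_limit;	/* ids at or above this are in the future */
	const unsigned long*	active;		/* ids active when the view was opened */
	size_t			n_active;
};

static bool toy_sees(const struct toy_read_view* v, unsigned long trx_id)
{
	size_t	i;

	if (trx_id < v->up_limit) {
		return true;
	}
	if (trx_id >= v->low_limit) {
		return false;
	}
	for (i = 0; i < v->n_active; i++) {
		if (v->active[i] == trx_id) {
			return false;	/* was uncommitted at view creation */
		}
	}
	return true;
}

int main(void)
{
	const unsigned long	active[] = { 7 };
	struct toy_read_view	v = { 5, 10, active, 1 };

	/* prints 1 0 0: an old committed change is visible; in-flight and
	future changes are not, so an older version must be built */
	printf("%d %d %d\n", toy_sees(&v, 3), toy_sees(&v, 7), toy_sees(&v, 12));
	return 0;
}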
Functions like + page_rec_is_comp() cannot be used! */ + + if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) { + + /* The record is delete-marked: we can skip it */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE + && !did_semi_consistent_read) { + + /* No need to keep a lock on a delete-marked record + if we do not want to use next-key locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + /* This is an optimization to skip setting the next key lock + on the record that follows this delete-marked record. This + optimization works because of the unique search criteria + which precludes the presence of a range lock between this + delete marked record and the record following it. + + For now this is applicable only to clustered indexes while + doing a unique search. There is scope for further optimization + applicable to unique secondary indexes. Current behaviour is + to widen the scope of a lock on an already delete marked record + if the same record is deleted twice by the same transaction */ + if (index == clust_index && unique_search) { + err = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + goto next_rec; + } + + /* Get the clustered index record if needed, if we did not do the + search using the clustered index. */ + + if (index != clust_index && prebuilt->need_to_access_clustered) { + +requires_clust_rec: + /* We use a 'goto' to the preceding label if a consistent + read of a secondary index record requires us to look up old + versions of the associated clustered index record. */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_has_extra_clust_latch = TRUE; + + /* The following call returns 'offsets' associated with + 'clust_rec'. Note that 'clust_rec' can be an old version + built for a consistent read. */ + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, + &offsets, &heap, &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE); + + goto next_rec; + } + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + /* Note that both the secondary index record + and the clustered index record were locked. */ + ut_ad(prebuilt->new_rec_locks == 1); + prebuilt->new_rec_locks = 2; + } + + if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) { + + /* The record is delete marked: we can skip it */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked + record if we do not want to use next-key + locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + goto next_rec; + } + + if (prebuilt->need_to_access_clustered) { + + result_rec = clust_rec; + + ut_ad(rec_offs_validate(result_rec, clust_index, + offsets)); + } else { + /* We used 'offsets' for the clust rec, recalculate + them for 'rec' */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + result_rec = rec; + } + } else { + result_rec = rec; + } + + /* We found a qualifying record 'result_rec'. At this point, + 'offsets' are associated with 'result_rec'. */ + + ut_ad(rec_offs_validate(result_rec, + result_rec != rec ? 
clust_index : index, + offsets)); + + /* At this point, the clustered index record is protected + by a page latch that was acquired when pcur was positioned. + The latch will not be released until mtr_commit(&mtr). */ + + if ((match_mode == ROW_SEL_EXACT + || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD) + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->templ_contains_blob + && !prebuilt->clust_index_was_generated + && !prebuilt->used_in_HANDLER + && prebuilt->template_type + != ROW_MYSQL_DUMMY_TEMPLATE) { + + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE. + Since we keep space in prebuilt only for the BLOBs of + a single row, we cannot cache rows in the case there + are BLOBs in the fields to be fetched. In HANDLER we do + not cache rows because there the cursor is a scrollable + cursor. */ + + row_sel_push_cache_row_for_mysql(prebuilt, result_rec, + offsets); + if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { + + goto got_row; + } + + goto next_rec; + } else { + if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) { + memcpy(buf + 4, result_rec + - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); + } else { + if (!row_sel_store_mysql_rec(buf, prebuilt, + result_rec, offsets)) { + err = DB_TOO_BIG_RECORD; + + goto lock_wait_or_error; + } + } + + if (prebuilt->clust_index_was_generated) { + if (result_rec != rec) { + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, + &heap); + } + row_sel_store_row_id_to_prebuilt(prebuilt, rec, + index, offsets); + } + } + + /* From this point on, 'offsets' are invalid. */ + +got_row: + /* We have an optimization to save CPU time: if this is a consistent + read on a unique condition on the clustered index, then we do not + store the pcur position, because any fetch next or prev will anyway + return 'end of file'. Exceptions are locking reads and the MySQL + HANDLER command where the user can move the cursor with PREV or NEXT + even after a unique search. */ + + if (!unique_search_from_clust_index + || prebuilt->select_lock_type != LOCK_NONE + || prebuilt->used_in_HANDLER) { + + /* Inside an update always store the cursor position */ + + btr_pcur_store_position(pcur, &mtr); + } + + err = DB_SUCCESS; + + goto normal_return; + +next_rec: + /* Reset the old and new "did semi-consistent read" flags. */ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = FALSE; + prebuilt->new_rec_locks = 0; + + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + + if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) { + /* We must commit mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
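
The ROW_MYSQL_DUMMY_TEMPLATE branch above copies a raw record image: the record's extra bytes and data bytes go to buf + 4, and the first four bytes receive extra_size + 4 so that a reader can find the record origin inside the buffer. A self-contained sketch of that layout (mach_write_to_4() is big-endian in InnoDB; the helper below assumes the same):

#include <stdio.h>
#include <string.h>

static void pack_rec_image(unsigned char* buf,
			   const unsigned char* rec_start,	/* first extra byte */
			   unsigned long extra_size,
			   unsigned long total_size)		/* extra + data */
{
	unsigned long	origin = extra_size + 4;

	/* 4-byte big-endian length header, like mach_write_to_4() */
	buf[0] = (unsigned char) (origin >> 24);
	buf[1] = (unsigned char) (origin >> 16);
	buf[2] = (unsigned char) (origin >> 8);
	buf[3] = (unsigned char) origin;

	memcpy(buf + 4, rec_start, total_size);
}

int main(void)
{
	unsigned char	rec[] = { 0xAA, 0xBB, 0x01, 0x02, 0x03 };	/* 2 extra + 3 data */
	unsigned char	buf[64];

	pack_rec_image(buf, rec, 2, sizeof rec);

	printf("record origin at offset %u\n",
	       (unsigned) ((buf[0] << 24) | (buf[1] << 16)
			   | (buf[2] << 8) | buf[3]));	/* prints 6 */
	return 0;
}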
*/ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + mtr_start(&mtr); + if (sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, + pcur, moves_up, &mtr)) { +#ifdef UNIV_SEARCH_DEBUG + cnt++; +#endif /* UNIV_SEARCH_DEBUG */ + + goto rec_loop; + } + } + + if (moves_up) { + if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) { +not_moved: + btr_pcur_store_position(pcur, &mtr); + + if (match_mode != 0) { + err = DB_RECORD_NOT_FOUND; + } else { + err = DB_END_OF_INDEX; + } + + goto normal_return; + } + } else { + if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) { + goto not_moved; + } + } + +#ifdef UNIV_SEARCH_DEBUG + cnt++; +#endif /* UNIV_SEARCH_DEBUG */ + + goto rec_loop; + +lock_wait_or_error: + /* Reset the old and new "did semi-consistent read" flags. */ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = FALSE; + + /*-------------------------------------------------------------*/ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + trx->error_state = err; + + /* The following is a patch for MySQL */ + + que_thr_stop_for_mysql(thr); + + thr->lock_state = QUE_THR_LOCK_ROW; + + if (row_mysql_handle_errors(&err, trx, thr, NULL)) { + /* It was a lock wait, and it ended */ + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + mtr_start(&mtr); + + sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && !same_user_rec) { + + /* Since we were not able to restore the cursor + on the same user record, we cannot use + row_unlock_for_mysql() to unlock any records, and + we must thus reset the new rec lock info. Since + in lock0lock.c we have blocked the inheriting of gap + X-locks, we actually do not have any new record locks + set in this case. + + Note that if we were able to restore on the 'same' + user record, it is still possible that we were actually + waiting on a delete-marked record, and meanwhile + it was removed by purge and inserted again by some + other user. But that is no problem, because in + rec_loop we will again try to set a lock, and + new_rec_lock_info in trx will be right at the end. */ + + prebuilt->new_rec_locks = 0; + } + + mode = pcur->search_mode; + + goto rec_loop; + } + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + +#ifdef UNIV_SEARCH_DEBUG + /* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ +#endif /* UNIV_SEARCH_DEBUG */ + goto func_exit; + +normal_return: + /*-------------------------------------------------------------*/ + que_thr_stop_for_mysql_no_error(thr, trx); + + mtr_commit(&mtr); + + if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + err = DB_SUCCESS; + } + +#ifdef UNIV_SEARCH_DEBUG + /* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ +#endif /* UNIV_SEARCH_DEBUG */ + if (err == DB_SUCCESS) { + srv_n_rows_read++; + } + +func_exit: + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Set or reset the "did semi-consistent read" flag on return. + The flag did_semi_consistent_read is set if and only if + the record being returned was fetched with a semi-consistent read. 
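
Stripped of the InnoDB specifics, the lock_wait_or_error path above is a classic "persist position, release latches, wait, restore, retry" loop. A schematic sketch with stub functions; nothing here is an InnoDB call:

#include <stdbool.h>
#include <stdio.h>

/* Pretend the row lock is granted on the third attempt. */
static bool fetch_row(int attempt)
{
	return attempt >= 2;
}

int main(void)
{
	int	attempt = 0;

	for (;;) {
		if (fetch_row(attempt)) {
			printf("row fetched after %d lock wait(s)\n", attempt);
			return 0;
		}
		/* store the cursor position and commit the mtr here,
		then block until the lock is granted ... */
		attempt++;
		/* ... then start a new mtr, restore the cursor and
		re-enter the record loop (rec_loop above) */
	}
}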
*/ + ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS + || !did_semi_consistent_read); + + if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) { + if (UNIV_UNLIKELY(did_semi_consistent_read)) { + prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + } + return(err); +} + +/*******************************************************************//** +Checks if MySQL at the moment is allowed for this table to retrieve a +consistent read result, or store it to the query cache. +@return TRUE if storing or retrieving from the query cache is permitted */ +UNIV_INTERN +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + trx_t* trx, /*!< in: transaction object */ + const char* norm_name) /*!< in: concatenation of database name, + '/' char, table name */ +{ + dict_table_t* table; + ibool ret = FALSE; + + table = dict_table_get(norm_name, FALSE); + + if (table == NULL) { + + return(FALSE); + } + + mutex_enter(&kernel_mutex); + + /* Start the transaction if it is not started yet */ + + trx_start_if_not_started_low(trx); + + /* If there are locks on the table or some trx has invalidated the + cache up to our trx id, then ret = FALSE. + We do not check what type locks there are on the table, though only + IX type locks actually would require ret = FALSE. */ + + if (UT_LIST_GET_LEN(table->locks) == 0 + && ut_dulint_cmp(trx->id, + table->query_cache_inv_trx_id) >= 0) { + + ret = TRUE; + + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !trx->read_view) { + + trx->read_view = read_view_open_now( + trx->id, trx->global_read_view_heap); + trx->global_read_view = trx->read_view; + } + } + + mutex_exit(&kernel_mutex); + + return(ret); +} + +/*******************************************************************//** +Read the AUTOINC column from the current row. If the value is less than +0 and the type is not unsigned then we reset the value to 0. +@return value read from the column */ +static +ib_uint64_t +row_search_autoinc_read_column( +/*===========================*/ + dict_index_t* index, /*!< in: index to read from */ + const rec_t* rec, /*!< in: current rec */ + ulint col_no, /*!< in: column number */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + ulint len; + const byte* data; + ib_uint64_t value; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + data = rec_get_nth_field(rec, offsets, col_no, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len <= sizeof value); + + /* we assume AUTOINC value cannot be negative */ + value = mach_read_int_type(data, len, unsigned_type); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (!unsigned_type && (ib_int64_t) value < 0) { + value = 0; + } + + return(value); +} + +/*******************************************************************//** +Get the last row. 
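
row_search_autoinc_read_column() above delegates the byte decoding to mach_read_int_type(). As a rough model: InnoDB stores integer columns big-endian, with the sign bit inverted for signed types so that memcmp() ordering matches numeric ordering. The 4-byte reader below assumes that format (an assumption for this sketch) and applies the same "negative becomes 0" clamp:

#include <stdint.h>
#include <stdio.h>

static uint64_t read_int4(const unsigned char* data, int unsigned_type)
{
	uint32_t	v = ((uint32_t) data[0] << 24) | ((uint32_t) data[1] << 16)
			| ((uint32_t) data[2] << 8) | (uint32_t) data[3];

	if (!unsigned_type) {
		v ^= 0x80000000u;	/* undo the stored sign-bit flip */

		if ((int32_t) v < 0) {
			return 0;	/* an AUTOINC value cannot be negative */
		}
	}

	return v;
}

int main(void)
{
	unsigned char	plus_five[] = { 0x80, 0x00, 0x00, 0x05 };	/* stored +5 */
	unsigned char	minus_one[] = { 0x7F, 0xFF, 0xFF, 0xFF };	/* stored -1 */

	printf("%llu %llu\n",
	       (unsigned long long) read_int4(plus_five, 0),
	       (unsigned long long) read_int4(minus_one, 0));	/* prints 5 0 */
	return 0;
}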
+@return current rec or NULL */ +static +const rec_t* +row_search_autoinc_get_rec( +/*=======================*/ + btr_pcur_t* pcur, /*!< in: the current cursor */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_prev(pcur, mtr)); + + return(NULL); +} + +/*******************************************************************//** +Read the max AUTOINC value from an index. +@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if +column name can't be found in index */ +UNIV_INTERN +ulint +row_search_max_autoinc( +/*===================*/ + dict_index_t* index, /*!< in: index to search */ + const char* col_name, /*!< in: name of autoinc column */ + ib_uint64_t* value) /*!< out: AUTOINC value read */ +{ + ulint i; + ulint n_cols; + dict_field_t* dfield = NULL; + ulint error = DB_SUCCESS; + + n_cols = dict_index_get_n_ordering_defined_by_user(index); + + /* Search the index for the AUTOINC column name */ + for (i = 0; i < n_cols; ++i) { + dfield = dict_index_get_nth_field(index, i); + + if (strcmp(col_name, dfield->name) == 0) { + break; + } + } + + *value = 0; + + /* Must find the AUTOINC column name */ + if (i < n_cols && dfield) { + mtr_t mtr; + btr_pcur_t pcur; + + mtr_start(&mtr); + + /* Open at the high/right end (FALSE), and INIT + cursor (TRUE) */ + btr_pcur_open_at_index_side( + FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + + if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) { + const rec_t* rec; + + rec = row_search_autoinc_get_rec(&pcur, &mtr); + + if (rec != NULL) { + ibool unsigned_type = ( + dfield->col->prtype & DATA_UNSIGNED); + + *value = row_search_autoinc_read_column( + index, rec, i, unsigned_type); + } + } + + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + } else { + error = DB_RECORD_NOT_FOUND; + } + + return(error); +} diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.c new file mode 100644 index 00000000000..9f9c814f1a5 --- /dev/null +++ b/storage/innobase/row/row0uins.c @@ -0,0 +1,350 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0uins.c +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" + +#ifdef UNIV_NONINL +#include "row0uins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" + +/***************************************************************//** +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_ins_remove_clust_rec( +/*==========================*/ + undo_node_t* node) /*!< in: undo node */ +{ + btr_cur_t* btr_cur; + ibool success; + ulint err; + ulint n_tries = 0; + mtr_t mtr; + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur), + &mtr); + ut_a(success); + + if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Drop the index tree associated with the row in + SYS_INDEXES table: */ + + dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr); + + mtr_commit(&mtr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, + &(node->pcur), &mtr); + ut_a(success); + } + + btr_cur = btr_pcur_get_btr_cur(&(node->pcur)); + + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (success) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_TREE, + &(node->pcur), &mtr); + ut_a(success); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + trx_is_recv(node->trx) + ? RB_RECOVERY + : RB_NORMAL, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + trx_undo_rec_release(node->trx, node->undo_no); + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry if found. 
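
row_undo_ins_remove_clust_rec() above shows the pervasive InnoDB two-phase pattern: attempt a cheap page-local (optimistic) delete first, and only if that cannot succeed without restructuring the tree, reposition the cursor and run a pessimistic pass that may modify the tree. A schematic sketch, with stubs standing in for the btr_cur_* calls:

#include <stdbool.h>
#include <stdio.h>

static bool optimistic_delete(void)
{
	return false;	/* pretend the page would underflow */
}

static int pessimistic_delete(void)
{
	return 0;	/* 0 standing in for DB_SUCCESS */
}

int main(void)
{
	if (optimistic_delete()) {
		puts("deleted within the leaf page");
		return 0;
	}

	/* restore the persistent cursor under BTR_MODIFY_TREE
	and retry with tree modifications allowed */
	printf("pessimistic delete returned %d\n", pessimistic_delete());
	return 0;
}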
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		found;
+	ibool		success;
+	ulint		err;
+	mtr_t		mtr;
+
+	log_free_check();
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	if (!found) {
+		/* Not found */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(DB_SUCCESS);
+	}
+
+	if (mode == BTR_MODIFY_LEAF) {
+		success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+		if (success) {
+			err = DB_SUCCESS;
+		} else {
+			err = DB_FAIL;
+		}
+	} else {
+		ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* No need to distinguish RB_RECOVERY here, because we
+		are deleting a secondary index record: the distinction
+		between RB_NORMAL and RB_RECOVERY only matters when
+		deleting a record that contains externally stored
+		columns. */
+		ut_ad(!dict_index_is_clust(index));
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+					   RB_NORMAL, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+	ulint	err;
+	ulint	n_tries	= 0;
+
+	/* Try first optimistic descent to the B-tree */
+
+	err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+	if (err == DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* Try then pessimistic descent to the B-tree */
+retry:
+	err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+		goto retry;
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+	undo_node_t*	node)	/*!< in/out: row undo node */
+{
+	dict_index_t*	clust_index;
+	byte*		ptr;
+	undo_no_t	undo_no;
+	dulint		table_id;
+	ulint		type;
+	ulint		dummy;
+	ibool		dummy_extern;
+
+	ut_ad(node);
+
+	ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+				    &dummy_extern, &undo_no, &table_id);
+	ut_ad(type == TRX_UNDO_INSERT_REC);
+	node->rec_type = type;
+
+	node->update = NULL;
+	node->table = dict_table_get_on_id(table_id, node->trx);
+
+	/* Skip the UNDO if we can't find the table or the .ibd file.
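
The retry loop in row_undo_ins_remove_sec() above bounds how long rollback keeps fighting DB_OUT_OF_FILE_SPACE: it sleeps and retries a fixed number of times, hoping space is freed meanwhile. The same control flow in isolation; the limit and the stub are illustrative, not BTR_CUR_RETRY_DELETE_N_TIMES itself:

#include <stdio.h>

enum { RETRY_LIMIT = 100, ERR_OK = 0, ERR_NO_SPACE = 1 };

/* Pretend some space is freed after two failed attempts. */
static int pessimistic_remove(int attempt)
{
	return attempt < 2 ? ERR_NO_SPACE : ERR_OK;
}

int main(void)
{
	int	n_tries = 0;
	int	err;

	for (;;) {
		err = pessimistic_remove(n_tries);

		if (err != ERR_NO_SPACE || n_tries >= RETRY_LIMIT) {
			break;
		}

		n_tries++;
		/* os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME) would pause here */
	}

	printf("err=%d after %d retries\n", err, n_tries);	/* err=0 after 2 retries */
	return 0;
}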
*/ + if (UNIV_UNLIKELY(node->table == NULL)) { + } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) { + node->table = NULL; + } else { + clust_index = dict_table_get_first_index(node->table); + + if (clust_index != NULL) { + ptr = trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, node->heap); + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: table "); + ut_print_name(stderr, node->trx, TRUE, + node->table->name); + fprintf(stderr, " has no indexes, " + "ignoring the table\n"); + + node->table = NULL; + } + } +} + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +ulint +row_undo_ins( +/*=========*/ + undo_node_t* node) /*!< in: row undo node */ +{ + ut_ad(node); + ut_ad(node->state == UNDO_NODE_INSERT); + + row_undo_ins_parse_undo_rec(node); + + if (!node->table || !row_undo_search_clust_to_pcur(node)) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } + + /* Iterate over all the indexes and undo the insert.*/ + + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + while (node->index != NULL) { + dtuple_t* entry; + ulint err; + + entry = row_build_index_entry(node->row, node->ext, + node->index, node->heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. */ + ut_a(trx_is_recv(node->trx)); + } else { + err = row_undo_ins_remove_sec(node->index, entry); + + if (err != DB_SUCCESS) { + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } + + return(row_undo_ins_remove_clust_rec(node)); +} diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c new file mode 100644 index 00000000000..6be475d8c78 --- /dev/null +++ b/storage/innobase/row/row0umod.c @@ -0,0 +1,815 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0umod.c +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" + +#ifdef UNIV_NONINL +#include "row0umod.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. +(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/***********************************************************//** +Checks if also the previous version of the clustered index record was +modified or inserted by the same transaction, and its undo number is such +that it should be undone in the same rollback. +@return TRUE if also previous modify or insert of this row should be undone */ +UNIV_INLINE +ibool +row_undo_mod_undo_also_prev_vers( +/*=============================*/ + undo_node_t* node, /*!< in: row undo node */ + undo_no_t* undo_no)/*!< out: the undo number */ +{ + trx_undo_rec_t* undo_rec; + trx_t* trx; + + trx = node->trx; + + if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) { + + *undo_no = ut_dulint_zero; + return(FALSE); + } + + undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap); + + *undo_no = trx_undo_rec_get_undo_no(undo_rec); + + return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0); +} + +/***********************************************************//** +Undoes a modify in a clustered index record. 
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ +static +ulint +row_undo_mod_clust_low( +/*===================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in: mtr; must be committed before + latching any further pages */ + ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + ut_ad(success); + + if (mode == BTR_MODIFY_LEAF) { + + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + mem_heap_t* heap = NULL; + big_rec_t* dummy_big_rec; + + ut_ad(mode == BTR_MODIFY_TREE); + + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, &heap, &dummy_big_rec, node->update, + node->cmpl_info, thr, mtr); + + ut_a(!dummy_big_rec); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + return(err); +} + +/***********************************************************//** +Removes a clustered index record after undo if possible. +@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ +static +ulint +row_undo_mod_remove_clust_low( +/*==========================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr __attribute__((unused)), /*!< in: query thread */ + mtr_t* mtr, /*!< in: mtr */ + ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + if (!success) { + + return(DB_SUCCESS); + } + + /* Find out if we can remove the whole clustered index record */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { + + /* Ok, we can remove */ + } else { + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* Note that since this operation is analogous to purge, + we can free also inherited externally stored fields: + hence the RB_NONE in the call below */ + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, RB_NONE, mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + + return(err); +} + +/***********************************************************//** +Undoes a modify in a clustered index record. Sets also the node state for the +next round of undo. 
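
row_undo_mod_undo_also_prev_vers() above reduces to a two-part test: the prior record version must have been written by the same transaction, and its undo number must still lie within the rollback limit. The same test in isolation, with plain integers standing in for the dulint ids:

#include <stdbool.h>
#include <stdio.h>

static bool undo_prev_version_too(unsigned long prev_trx_id,
				  unsigned long cur_trx_id,
				  unsigned long prev_undo_no,
				  unsigned long roll_limit)
{
	return prev_trx_id == cur_trx_id && roll_limit <= prev_undo_no;
}

int main(void)
{
	printf("%d\n", undo_prev_version_too(42, 42, 7, 5));	/* 1: same trx, in range */
	printf("%d\n", undo_prev_version_too(42, 42, 3, 5));	/* 0: before the roll limit */
	printf("%d\n", undo_prev_version_too(41, 42, 7, 5));	/* 0: written by another trx */
	return 0;
}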
+@return DB_SUCCESS or error code: we may run out of file space */ +static +ulint +row_undo_mod_clust( +/*===============*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + ulint err; + ibool success; + ibool more_vers; + undo_no_t new_undo_no; + + ut_ad(node && thr); + + /* Check if also the previous version of the clustered index record + should be undone in this same rollback operation */ + + more_vers = row_undo_mod_undo_also_prev_vers(node, &new_undo_no); + + pcur = &(node->pcur); + + mtr_start(&mtr); + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_LEAF); + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a + pessimistic descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + node->state = UNDO_NODE_FETCH_NEXT; + + trx_undo_rec_release(node->trx, node->undo_no); + + if (more_vers && err == DB_SUCCESS) { + + /* Reserve the undo log record to the prior version after + committing &mtr: this is necessary to comply with the latching + order, as &mtr may contain the fsp latch which is lower in + the latch hierarchy than trx->undo_mutex. */ + + success = trx_undo_rec_reserve(node->trx, new_undo_no); + + if (success) { + node->state = UNDO_NODE_PREV_VERS; + } + } + + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry */ + ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + ibool found; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has; + ulint err; + mtr_t mtr; + mtr_t mtr_vers; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* In crash recovery, the secondary index record may + be missing if the UPDATE did not have time to insert + the secondary index records before the crash. When we + are undoing that UPDATE in crash recovery, the record + may be missing. + + In normal processing, if an update ends in a deadlock + before it has inserted all updated secondary index + records, then the undo will not find those records. */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. 
If some requires, + we should delete mark the record. */ + + mtr_start(&mtr_vers); + + success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur), + &mtr_vers); + ut_a(success); + + old_has = row_vers_old_has_index_entry(FALSE, + btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + if (old_has) { + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, &mtr); + ut_ad(err == DB_SUCCESS); + } else { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* No need to distinguish RB_RECOVERY here, because we + are deleting a secondary index record: the distinction + between RB_NORMAL and RB_RECOVERY only matters when + deleting a record that contains externally stored + columns. */ + ut_ad(!dict_index_is_clust(index)); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NORMAL, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +NOTE that if we updated the fields of a delete-marked secondary index record +so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot +return to the original values because we do not know them. But this should +not cause problems because in row0sel.c, in queries we always retrieve the +clustered index record or an earlier version of it, if the secondary index +record through which we do the search is delete-marked. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + ulint err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_TREE); + return(err); +} + +/***********************************************************//** +Delete unmarks a secondary index entry which must be found. It might not be +delete-marked at the moment, but it does not harm to unmark it anyway. We also +need to update the fields of the secondary index record if we updated its +fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. +@return DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_mod_del_unmark_sec_and_undo_update( +/*========================================*/ + ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + upd_t* update; + ulint err = DB_SUCCESS; + big_rec_t* dummy_big_rec; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + + /* Ignore indexes that are being created. 
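
The branch above encodes a single rule: physically remove the secondary index entry only when no not-yet-purged prior version of the row still needs it; otherwise only set the delete mark and let purge remove the entry later. The rule in isolation:

#include <stdbool.h>
#include <stdio.h>

enum sec_undo_action { SEC_REMOVE, SEC_DELETE_MARK };

static enum sec_undo_action choose_sec_undo_action(bool old_version_has_entry)
{
	return old_version_has_entry ? SEC_DELETE_MARK : SEC_REMOVE;
}

int main(void)
{
	/* prints 1 0: keep (delete-mark) while an old version still needs
	the entry, remove outright once nothing references it any more */
	printf("%d %d\n", choose_sec_undo_action(true),
	       choose_sec_undo_action(false));
	return 0;
}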
*/ + if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) { + + return(DB_SUCCESS); + } + + log_free_check(); + mtr_start(&mtr); + + if (UNIV_UNLIKELY(!row_search_index_entry(index, entry, + mode, &pcur, &mtr))) { + fputs("InnoDB: error in sec index entry del undo in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); + putc('\n', stderr); + trx_print(stderr, trx, 0); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } else { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); + ut_a(err == DB_SUCCESS); + heap = mem_heap_create(100); + + update = row_upd_build_sec_rec_difference_binary( + index, entry, btr_cur_get_rec(btr_cur), trx, heap); + if (upd_get_n_fields(update) == 0) { + + /* Do nothing */ + + } else if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping + changes within the page */ + + err = btr_cur_optimistic_update( + BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, + btr_cur, update, 0, thr, &mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update( + BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, + btr_cur, &heap, &dummy_big_rec, + update, 0, thr, &mtr); + ut_a(!dummy_big_rec); + } + + mem_heap_free(heap); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_mod_upd_del_sec( +/*=====================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err = DB_SUCCESS; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, node->ext, + index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. */ + ut_a(trx_is_recv(thr_get_trx(thr))); + } else { + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + + if (err != DB_SUCCESS) { + + break; + } + } + + mem_heap_empty(heap); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. 
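
In row_undo_mod_del_unmark_sec_and_undo_update() above, the optimistic update folds the "record no longer fits in the page" family of errors into DB_FAIL so that the caller escalates to a BTR_MODIFY_TREE pass. The folding in isolation; the enum values are illustrative, not the real DB_* codes:

#include <stdio.h>

enum { E_SUCCESS, E_FAIL, E_OVERFLOW, E_UNDERFLOW, E_ZIP_OVERFLOW };

static int fold_page_full_errors(int err)
{
	switch (err) {
	case E_OVERFLOW:
	case E_UNDERFLOW:
	case E_ZIP_OVERFLOW:
		return E_FAIL;	/* caller retries pessimistically */
	default:
		return err;
	}
}

int main(void)
{
	printf("%d %d\n", fold_page_full_errors(E_ZIP_OVERFLOW),
	       fold_page_full_errors(E_SUCCESS));	/* prints 1 0 */
	return 0;
}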
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_mod_del_mark_sec( +/*======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, node->ext, + index, heap); + ut_a(entry); + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err != DB_SUCCESS) { + + mem_heap_free(heap); + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static +ulint +row_undo_mod_upd_exist_sec( +/*=======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + /* No change in secondary indexes */ + + return(DB_SUCCESS); + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field_binary(node->row, node->index, + node->update)) { + + /* Build the newest version of the index entry */ + entry = row_build_index_entry(node->row, node->ext, + index, heap); + ut_a(entry); + /* NOTE that if we updated the fields of a + delete-marked secondary index record so that + alphabetically they stayed the same, e.g., + 'abc' -> 'aBc', we cannot return to the original + values because we do not know them. But this should + not cause problems because in row0sel.c, in queries + we always retrieve the clustered index record or an + earlier version of it, if the secondary index record + through which we do the search is delete-marked. */ + + err = row_undo_mod_del_mark_or_remove_sec(node, thr, + index, + entry); + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row. We also need to update the fields of + the secondary index record if we updated its fields + but alphabetically they stayed the same, e.g., + 'abc' -> 'aBc'. */ + mem_heap_empty(heap); + entry = row_build_index_entry(node->undo_row, + node->undo_ext, + index, heap); + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/***********************************************************//** +Parses the row reference and other info in a modify undo log record. 
*/ +static +void +row_undo_mod_parse_undo_rec( +/*========================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + undo_no_t undo_no; + dulint table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + ibool dummy_extern; + trx_t* trx; + + ut_ad(node && thr); + trx = thr_get_trx(thr); + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + node->rec_type = type; + + node->table = dict_table_get_on_id(table_id, trx); + + /* TODO: other fixes associated with DROP TABLE + rollback in the + same table by another user */ + + if (node->table == NULL) { + /* Table was dropped */ + return; + } + + if (node->table->ibd_file_missing) { + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + node->new_roll_ptr = roll_ptr; + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; +} + +/***********************************************************//** +Undoes a modify operation on a row of a table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ulint +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + ut_ad(node->state == UNDO_NODE_MODIFY); + + row_undo_mod_parse_undo_rec(node, thr); + + if (!node->table || !row_undo_search_clust_to_pcur(node)) { + /* It is already undone, or will be undone by another query + thread, or table was dropped */ + + trx_undo_rec_release(node->trx, node->undo_no); + node->state = UNDO_NODE_FETCH_NEXT; + + return(DB_SUCCESS); + } + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + err = row_undo_mod_upd_exist_sec(node, thr); + + } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { + + err = row_undo_mod_del_mark_sec(node, thr); + } else { + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + err = row_undo_mod_upd_del_sec(node, thr); + } + + if (err != DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_clust(node, thr); + + return(err); +} diff --git a/storage/innobase/row/row0undo.c b/storage/innobase/row/row0undo.c new file mode 100644 index 00000000000..3d739c9689a --- /dev/null +++ b/storage/innobase/row/row0undo.c @@ -0,0 +1,377 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.c
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in
+the clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes and also the roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to the undo log. Well, maybe not, because if we order the row id and
+trx id in descending order, then the only undeleted copy is the first in
+the index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an
+ordering field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither the secondary index nor the clustered index
+ord field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates a secondary index ord field but not the clustered one:
+then in the secondary index there are delete marked records, which differ
+in an ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index +is unique. Then the record in secondary index is just updated at the +clustered ord field. +(4) + +Problem with duplicate records: +Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a +bigger trx id has inserted and delete marked a similar row, our trx inserts +again a similar row, and a trx with an even bigger id delete marks it. Then +the position of the row should change in the index if the trx id affects +the alphabetical ordering. + +Fix 2: If an insert encounters a similar row marked deleted, we turn the +insert into an 'update' of the row marked deleted. Then we must write undo +info on the update. A problem: what if a purge operation tries to remove +the delete marked row? + +We can think of the database row versions as a linked list which starts +from the record in the clustered index, and is linked by roll ptrs +through undo logs. The secondary index records are references which tell +what kinds of records can be found in this linked list for a record +in the clustered index. + +How to do the purge? A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +UNIV_INTERN +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx && parent && heap); + + undo = mem_heap_alloc(heap, sizeof(undo_node_t)); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->state = UNDO_NODE_FETCH_NEXT; + undo->trx = trx; + + btr_pcur_init(&(undo->pcur)); + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. 
+@return TRUE if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +UNIV_INTERN +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in: row undo node */ +{ + dict_index_t* clust_index; + ibool found; + mtr_t mtr; + ibool ret; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + rec = btr_pcur_get_rec(&(node->pcur)); + + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, + row_get_rec_roll_ptr(rec, clust_index, + offsets))) { + + /* We must remove the reservation on the undo log record + BEFORE releasing the latch on the clustered index page: this + is to make sure that some thread will eventually undo the + modification corresponding to node->roll_ptr. */ + + /* fputs("--------------------undoing a previous version\n", + stderr); */ + + ret = FALSE; + } else { + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + offsets, NULL, &node->ext, node->heap); + if (node->update) { + node->undo_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->undo_row, &node->undo_ext, + clust_index, node->update, node->heap); + } else { + node->undo_row = NULL; + node->undo_ext = NULL; + } + + btr_pcur_store_position(&(node->pcur), &mtr); + + ret = TRUE; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ret); +} + +/***********************************************************//** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. +@return DB_SUCCESS if operation successfully completed, else error code */ +static +ulint +row_undo( +/*=====*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + trx_t* trx; + roll_ptr_t roll_ptr; + ibool locked_data_dict; + + ut_ad(node && thr); + + trx = node->trx; + + if (node->state == UNDO_NODE_FETCH_NEXT) { + + node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, + trx->roll_limit, + &roll_ptr, + node->heap); + if (!node->undo_rec) { + /* Rollback completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + + } else if (node->state == UNDO_NODE_PREV_VERS) { + + /* Undo should be done to the same clustered index record + again in this same rollback, restoring the previous version */ + + roll_ptr = node->new_roll_ptr; + + node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr, + node->heap); + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + } + + /* Prevent DROP TABLE etc. while we are rolling back this row. 
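+	Undoing the row may modify both the clustered index and the
+	secondary indexes, so the table definition must not change meanwhile.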
+ If we are doing a TABLE CREATE or some other dictionary operation, + then we already have dict_operation_lock locked in x-mode. Do not + try to lock again, because that would cause a hang. */ + + locked_data_dict = (trx->dict_operation_lock_mode == 0); + + if (locked_data_dict) { + + row_mysql_lock_data_dictionary(trx); + } + + if (node->state == UNDO_NODE_INSERT) { + + err = row_undo_ins(node); + + node->state = UNDO_NODE_FETCH_NEXT; + } else { + ut_ad(node->state == UNDO_NODE_MODIFY); + err = row_undo_mod(node, thr); + } + + if (locked_data_dict) { + + row_mysql_unlock_data_dictionary(trx); + } + + /* Do some cleanup */ + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err; + undo_node_t* node; + trx_t* trx; + + ut_ad(thr); + + srv_activity_count++; + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + err = row_undo(node, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* SQL error detected */ + + fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", + (ulong) err); + + if (err == DB_OUT_OF_FILE_SPACE) { + fprintf(stderr, + "InnoDB: Error 13 means out of tablespace.\n" + "InnoDB: Consider increasing" + " your tablespace.\n"); + + exit(1); + } + + ut_error; + + return(NULL); + } + + return(thr); +} diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c new file mode 100644 index 00000000000..58dfd43ead9 --- /dev/null +++ b/storage/innobase/row/row0upd.c @@ -0,0 +1,2177 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.c
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+	Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for the previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only when
+the undo log is purged will the index records be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on a page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way to perform a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record.
This is only used in foreign key checks and we can assume +that index does not contain column prefixes. +@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: old value of index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n); /*!< in: how many first fields to check */ + + +/*********************************************************************//** +Checks if index currently is mentioned as a referenced index in a foreign +key constraint. + +NOTE that since we do not hold dict_operation_lock when leaving the +function, it may be that the referencing table has been dropped when +we leave this function: this function is only for heuristic use! + +@return TRUE if referenced */ +static +ibool +row_upd_index_is_referenced( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + dict_table_t* table = index->table; + dict_foreign_t* foreign; + ibool froze_data_dict = FALSE; + ibool is_referenced = FALSE; + + if (!UT_LIST_GET_FIRST(table->referenced_list)) { + + return(FALSE); + } + + if (trx->dict_operation_lock_mode == 0) { + row_mysql_freeze_data_dictionary(trx); + froze_data_dict = TRUE; + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + if (foreign->referenced_index == index) { + + is_referenced = TRUE; + goto func_exit; + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + +func_exit: + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(is_referenced); +} + +/*********************************************************************//** +Checks if possible foreign key constraints hold after a delete of the record +under pcur. + +NOTE that this function will temporarily commit mtr and lose the +pcur position! + +@return DB_SUCCESS or an error code */ +static +ulint +row_upd_check_references_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + trx_t* trx; + const rec_t* rec; + ulint n_ext; + ulint err; + ibool got_s_lock = FALSE; + + if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) { + + return(DB_SUCCESS); + } + + trx = thr_get_trx(thr); + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, + &n_ext, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + if (trx->dict_operation_lock_mode == 0) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. 
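+		Only if the update is a delete, or it changes some of the
+		first foreign->n_fields ordering fields, does the constraint
+		have to be rechecked below.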
*/ + + if (foreign->referenced_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + + if (foreign->foreign_table == NULL) { + dict_table_get(foreign->foreign_table_name, + FALSE); + } + + if (foreign->foreign_table) { + mutex_enter(&(dict_sys->mutex)); + + (foreign->foreign_table + ->n_foreign_key_checks_running)++; + + mutex_exit(&(dict_sys->mutex)); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects 'foreign' from + being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint( + FALSE, foreign, table, entry, thr); + + if (foreign->foreign_table) { + mutex_enter(&(dict_sys->mutex)); + + ut_a(foreign->foreign_table + ->n_foreign_key_checks_running > 0); + + (foreign->foreign_table + ->n_foreign_key_checks_running)--; + + mutex_exit(&(dict_sys->mutex)); + } + + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + err = DB_SUCCESS; + +func_exit: + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + mem_heap_free(heap); + + return(err); +} + +/*********************************************************************//** +Creates an update node for a query graph. +@return own: update node */ +UNIV_INTERN +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + upd_node_t* node; + + node = mem_heap_alloc(heap, sizeof(upd_node_t)); + node->common.type = QUE_NODE_UPDATE; + + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->in_mysql_interface = FALSE; + + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + node->index = NULL; + node->update = NULL; + + node->foreign = NULL; + node->cascade_heap = NULL; + node->cascade_node = NULL; + + node->select = NULL; + + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + node->cmpl_info = 0; + + return(node); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ +UNIV_INTERN +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint pos, /*!< in: TRX_ID position in rec */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_trx_id_and_roll_ptr( + page_zip, rec, offsets, pos, trx_id, roll_ptr); + } else { + byte* field; + ulint len; + + field = rec_get_nth_field(rec, offsets, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + trx_write_trx_id(field, trx_id); + trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr); + } +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Sets the trx id or roll ptr field of a clustered index entry. 
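+The position of the sys field within the entry is found with
+dict_index_get_sys_col_pos().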
*/ +UNIV_INTERN +void +row_upd_index_entry_sys_field( +/*==========================*/ + const dtuple_t* entry, /*!< in: index entry, where the memory buffers + for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ + dulint val) /*!< in: value to write */ +{ + dfield_t* dfield; + byte* field; + ulint pos; + + ut_ad(dict_index_is_clust(index)); + + pos = dict_index_get_sys_col_pos(index, type); + + dfield = dtuple_get_nth_field(entry, pos); + field = dfield_get_data(dfield); + + if (type == DATA_TRX_ID) { + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + trx_write_roll_ptr(field, val); + } +} + +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. +@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +UNIV_INTERN +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + new_len = dfield_get_len(new_val); + + if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) { + /* A bug fixed on Dec 31st, 2004: we looked at the + SQL NULL size from the wrong field! We may backport + this fix also to 4.0. The merge to 5.0 will be made + manually immediately after we commit this to 4.1. */ + + new_len = dict_col_get_sql_null_size( + dict_index_get_nth_col(index, + upd_field->field_no), + 0); + } + + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + + if (rec_offs_comp(offsets) + && rec_offs_nth_sql_null(offsets, + upd_field->field_no)) { + /* Note that in the compact table format, for a + variable length field, an SQL NULL will use zero + bytes in the offset array at the start of the physical + record, but a zero-length value (empty string) will + use one byte! Thus, we cannot use update-in-place + if we update an SQL NULL varchar to an empty string! */ + + old_len = UNIV_SQL_NULL; + } + + if (dfield_is_ext(new_val) || old_len != new_len + || rec_offs_nth_extern(offsets, upd_field->field_no)) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Replaces the new column values stored in the update vector to the record +given. No field size changes are allowed. 
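+(row_upd_changes_field_size_or_external() can be used beforehand to
+check that an update in place is possible.)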
*/ +UNIV_INTERN +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /*!< in/out: record where replaced */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + page_zip_des_t* page_zip)/*!< in: compressed page with enough space + available, or NULL */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (rec_offs_comp(offsets)) { + rec_set_info_bits_new(rec, update->info_bits); + } else { + rec_set_info_bits_old(rec, update->info_bits); + } + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + ut_ad(!dfield_is_ext(new_val) == + !rec_offs_nth_extern(offsets, upd_field->field_no)); + + rec_set_nth_field(rec, offsets, upd_field->field_no, + dfield_get_data(new_val), + dfield_get_len(new_val)); + } + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_rec(page_zip, rec, index, offsets, 0); + } +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. +@return new pointer to mlog */ +UNIV_INTERN +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + dict_index_t* index, /*!< in: clustered index */ + trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ + byte* log_ptr,/*!< pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr __attribute__((unused))) /*!< in: mtr */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr); + + log_ptr += mach_write_compressed(log_ptr, + dict_index_get_sys_col_pos( + index, DATA_TRX_ID)); + + trx_write_roll_ptr(log_ptr, roll_ptr); + log_ptr += DATA_ROLL_PTR_LEN; + + log_ptr += mach_dulint_write_compressed(log_ptr, trx->id); + + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Parses the log data of system field values. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_parse_sys_vals( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint* pos, /*!< out: TRX_ID position in record */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr)/*!< out: roll ptr */ +{ + ptr = mach_parse_compressed(ptr, end_ptr, pos); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + DATA_ROLL_PTR_LEN) { + + return(NULL); + } + + *roll_ptr = trx_read_roll_ptr(ptr); + ptr += DATA_ROLL_PTR_LEN; + + ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id); + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Writes to the redo log the new values of the fields occurring in the index. 
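+Such a log record is parsed by row_upd_index_parse().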
*/ +UNIV_INTERN +void +row_upd_index_write_log( +/*====================*/ + const upd_t* update, /*!< in: update vector */ + byte* log_ptr,/*!< in: pointer to mlog buffer: must + contain at least MLOG_BUF_MARGIN bytes + of free space; the buffer is closed + within this function */ + mtr_t* mtr) /*!< in: mtr into whose log to write */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf_end; + ulint i; + + n_fields = upd_get_n_fields(update); + + buf_end = log_ptr + MLOG_BUF_MARGIN; + + mach_write_to_1(log_ptr, update->info_bits); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, n_fields); + + for (i = 0; i < n_fields; i++) { + +#if MLOG_BUF_MARGIN <= 30 +# error "MLOG_BUF_MARGIN <= 30" +#endif + + if (log_ptr + 30 > buf_end) { + mlog_close(mtr, log_ptr); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + + len = dfield_get_len(new_val); + + log_ptr += mach_write_compressed(log_ptr, upd_field->field_no); + log_ptr += mach_write_compressed(log_ptr, len); + + if (len != UNIV_SQL_NULL) { + if (log_ptr + len < buf_end) { + memcpy(log_ptr, dfield_get_data(new_val), len); + + log_ptr += len; + } else { + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, + dfield_get_data(new_val), + len); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + } + } + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Parses the log data written by row_upd_index_write_log. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_index_parse( +/*================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + mem_heap_t* heap, /*!< in: memory heap where update vector is + built */ + upd_t** update_out)/*!< out: update vector */ +{ + upd_t* update; + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + ulint info_bits; + ulint i; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_bits = mach_read_from_1(ptr); + ptr++; + ptr = mach_parse_compressed(ptr, end_ptr, &n_fields); + + if (ptr == NULL) { + + return(NULL); + } + + update = upd_create(n_fields, heap); + update->info_bits = info_bits; + + for (i = 0; i < n_fields; i++) { + ulint field_no; + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + ptr = mach_parse_compressed(ptr, end_ptr, &field_no); + + if (ptr == NULL) { + + return(NULL); + } + + upd_field->field_no = field_no; + + ptr = mach_parse_compressed(ptr, end_ptr, &len); + + if (ptr == NULL) { + + return(NULL); + } + + if (len != UNIV_SQL_NULL) { + + if (end_ptr < ptr + len) { + + return(NULL); + } + + dfield_set_data(new_val, + mem_heap_dup(heap, ptr, len), len); + ptr += len; + } else { + dfield_set_null(new_val); + } + } + + *update_out = update; + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! 
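+Fields whose values in entry and rec are binary-equal are omitted from
+the returned vector.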
+@return own: update vector of differing fields */ +UNIV_INTERN +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: entry to insert */ + const rec_t* rec, /*!< in: secondary index record */ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint i; + ulint offsets_[REC_OFFS_SMALL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + /* This function is used only for a secondary index */ + ut_a(!dict_index_is_clust(index)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE that it may be that len != dfield_get_len(dfield) if we + are updating in a character set and collation where strings of + different length can be equal in an alphabetical comparison, + and also in the case where we have a column prefix index + and the last characters in the index field are spaces; the + latter case probably caused the assertion failures reported at + row0upd.c line 713 in versions 4.0.14 - 4.0.16. */ + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (!dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + +/***************************************************************//** +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! +@return own: update vector of differing fields, excluding roll ptr and +trx id */ +UNIV_INTERN +upd_t* +row_upd_build_difference_binary( +/*============================*/ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* entry, /*!< in: entry to insert */ + const rec_t* rec, /*!< in: clustered index record */ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint roll_ptr_pos; + ulint trx_id_pos; + ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + /* This function is used only for a clustered index */ + ut_a(dict_index_is_clust(index)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); + trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE: we compare the fields as binary strings! 
+ (No collation) */ + + if (i == trx_id_pos || i == roll_ptr_pos) { + + goto skip_compare; + } + + if (UNIV_UNLIKELY(!dfield_is_ext(dfield) + != !rec_offs_nth_extern(offsets, i)) + || !dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + n_diff++; + } +skip_compare: + ; + } + + update->n_fields = n_diff; + + return(update); +} + +/***********************************************************//** +Fetch a prefix of an externally stored column. This is similar +to row_ext_lookup(), but the row_ext_t holds the old values +of the column and must not be poisoned with the new values. +@return BLOB prefix */ +static +byte* +row_upd_ext_fetch( +/*==============*/ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part */ + ulint local_len, /*!< in: length of data, in bytes */ + ulint zip_size, /*!< in: nonzero=compressed BLOB + page size, zero for uncompressed + BLOBs */ + ulint* len, /*!< in: length of prefix to fetch; + out: fetched length of the prefix */ + mem_heap_t* heap) /*!< in: heap where to allocate */ +{ + byte* buf = mem_heap_alloc(heap, *len); + + *len = btr_copy_externally_stored_field_prefix(buf, *len, + zip_size, + data, local_len); + /* We should never update records containing a half-deleted BLOB. */ + ut_a(*len); + + return(buf); +} + +/***********************************************************//** +Replaces the new column value stored in the update vector in +the given index entry field. */ +static +void +row_upd_index_replace_new_col_val( +/*==============================*/ + dfield_t* dfield, /*!< in/out: data field + of the index entry */ + const dict_field_t* field, /*!< in: index field */ + const dict_col_t* col, /*!< in: field->col */ + const upd_field_t* uf, /*!< in: update field */ + mem_heap_t* heap, /*!< in: memory heap for allocating + and copying the new value */ + ulint zip_size)/*!< in: compressed page + size of the table, or 0 */ +{ + ulint len; + const byte* data; + + dfield_copy_data(dfield, &uf->new_val); + + if (dfield_is_null(dfield)) { + return; + } + + len = dfield_get_len(dfield); + data = dfield_get_data(dfield); + + if (field->prefix_len > 0) { + ibool fetch_ext = dfield_is_ext(dfield) + && len < (ulint) field->prefix_len + + BTR_EXTERN_FIELD_REF_SIZE; + + if (fetch_ext) { + ulint l = len; + + len = field->prefix_len; + + data = row_upd_ext_fetch(data, l, zip_size, + &len, heap); + } + + len = dtype_get_at_most_n_mbchars(col->prtype, + col->mbminlen, col->mbmaxlen, + field->prefix_len, len, + (const char*) data); + + dfield_set_data(dfield, data, len); + + if (!fetch_ext) { + dfield_dup(dfield, heap); + } + + return; + } + + switch (uf->orig_len) { + byte* buf; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(dfield, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(dfield); + /* fall through */ + case 0: + dfield_dup(dfield, heap); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + buf = mem_heap_alloc(heap, uf->orig_len); + /* Copy the locally stored prefix. 
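+		   (uf->orig_len includes the BTR_EXTERN_FIELD_REF_SIZE bytes
+		   of the field reference, which are copied separately below.)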
*/ + memcpy(buf, data, + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE); + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(dfield, buf, uf->orig_len); + dfield_set_ext(dfield); + break; + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + ibool order_only, + /*!< in: if TRUE, limit the replacement to + ordering fields of index; note that this + does not work for non-clustered indexes. */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ +{ + ulint i; + ulint n_fields; + const ulint zip_size = dict_table_zip_size(index->table); + + ut_ad(index); + + dtuple_set_info_bits(entry, update->info_bits); + + if (order_only) { + n_fields = dict_index_get_n_unique(index); + } else { + n_fields = dict_index_get_n_fields(index); + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + uf = upd_get_field_by_field_no(update, i); + + if (uf) { + row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size); + } + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ +{ + ulint i; + const dict_index_t* clust_index + = dict_table_get_first_index(index->table); + const ulint zip_size + = dict_table_zip_size(index->table); + + dtuple_set_info_bits(entry, update->info_bits); + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + uf = upd_get_field_by_field_no( + update, dict_col_get_clust_pos(col, clust_index)); + + if (uf) { + row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size); + } + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector. 
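+If the row contains externally stored columns that are needed for
+building index entries, a prefix cache of them is created in *ext.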
*/ +UNIV_INTERN +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint col_no; + ulint i; + ulint n_cols; + ulint n_ext_cols; + ulint* ext_cols; + const dict_table_t* table; + + ut_ad(row); + ut_ad(ext); + ut_ad(index); + ut_ad(dict_index_is_clust(index)); + ut_ad(update); + ut_ad(heap); + + n_cols = dtuple_get_n_fields(row); + table = index->table; + ut_ad(n_cols == dict_table_get_n_cols(table)); + + ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols); + n_ext_cols = 0; + + dtuple_set_info_bits(row, update->info_bits); + + for (col_no = 0; col_no < n_cols; col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + const ulint clust_pos + = dict_col_get_clust_pos(col, index); + dfield_t* dfield; + + if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) { + + continue; + } + + dfield = dtuple_get_nth_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + + if (upd_field->field_no != clust_pos) { + + continue; + } + + dfield_copy_data(dfield, &upd_field->new_val); + break; + } + + if (dfield_is_ext(dfield) && col->ord_part) { + ext_cols[n_ext_cols++] = col_no; + } + } + + if (n_ext_cols) { + *ext = row_ext_create(n_ext_cols, ext_cols, row, + dict_table_zip_size(table), heap); + } else { + *ext = NULL; + } +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +UNIV_INTERN +ibool +row_upd_changes_ord_field_binary( +/*=============================*/ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update) /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! 
*/ +{ + ulint n_unique; + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + + n_unique = dict_index_get_n_unique(index); + n_upd_fields = upd_get_n_fields(update); + + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n_unique; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + ulint col_no; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + col_no = dict_col_get_no(col); + + for (j = 0; j < n_upd_fields; j++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, j); + + /* Note that if the index field is a column prefix + then it may be that row does not contain an externally + stored part of the column value, and we cannot compare + the datas */ + + if (col_pos == upd_field->field_no + && (row == NULL + || ind_field->prefix_len > 0 + || !dfield_datas_are_binary_equal( + dtuple_get_nth_field(row, col_no), + &(upd_field->new_val)))) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector may change an ordering field in an index +record */ +UNIV_INTERN +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update) /*!< in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_index_t* index; + ulint i; + + index = dict_table_get_first_index(table); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (dict_field_get_col(dict_index_get_nth_field( + index, upd_field->field_no)) + ->ord_part) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. +@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n) /*!< in: how many first fields to check */ +{ + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + ut_ad(n <= dict_index_get_n_fields(index)); + + n_upd_fields = upd_get_n_fields(update); + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + + ut_a(ind_field->prefix_len == 0); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field_t* upd_field + = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && !dfield_datas_are_binary_equal( + dtuple_get_nth_field(entry, i), + &(upd_field->new_val))) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/*********************************************************************//** +Copies the column values from a record. 
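+The copied values are consumed by row_upd_eval_new_vals() when the new
+values of the update fields are evaluated.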
*/
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+	rec_t*		rec,	/*!< in: record in a clustered index */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	sym_node_t*	column)	/*!< in: first column in a column list, or
+				NULL */
+{
+	byte*	data;
+	ulint	len;
+
+	while (column) {
+		data = rec_get_nth_field(rec, offsets,
+					 column->field_nos[SYM_CLUST_FIELD_NO],
+					 &len);
+		eval_node_copy_and_alloc_val(column, data, len);
+
+		column = UT_LIST_GET_NEXT(col_var_list, column);
+	}
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+	upd_t*	update)	/*!< in/out: update vector */
+{
+	que_node_t*	exp;
+	upd_field_t*	upd_field;
+	ulint		n_fields;
+	ulint		i;
+
+	n_fields = upd_get_n_fields(update);
+
+	for (i = 0; i < n_fields; i++) {
+		upd_field = upd_get_nth_field(update, i);
+
+		exp = upd_field->exp;
+
+		eval_exp(exp);
+
+		dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+	}
+}
+
+/***********************************************************//**
+Stores to the heap the row on which the node->pcur is positioned. */
+static
+void
+row_upd_store_row(
+/*==============*/
+	upd_node_t*	node)	/*!< in: row update node */
+{
+	dict_index_t*	clust_index;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	rec_offs_init(offsets_);
+
+	ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+	if (node->row != NULL) {
+		mem_heap_empty(node->heap);
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	rec = btr_pcur_get_rec(node->pcur);
+
+	offsets = rec_get_offsets(rec, clust_index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+	node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+			      NULL, &node->ext, node->heap);
+	if (node->is_delete) {
+		node->upd_row = NULL;
+		node->upd_ext = NULL;
+	} else {
+		node->upd_row = dtuple_copy(node->row, node->heap);
+		row_upd_replace(node->upd_row, &node->upd_ext,
+				clust_index, node->update, node->heap);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
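+The old entry is delete marked, and the updated version is inserted as a
+fresh index entry. A simplified sketch of the steps (not the literal code):
+
+	entry = row_build_index_entry(node->row, node->ext, index, heap);
+	row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, &mtr);
+	btr_cur_del_mark_set_sec_rec(...);
+	entry = row_build_index_entry(node->upd_row, node->upd_ext,
+				      index, heap);
+	row_ins_index_entry(index, entry, 0, TRUE, thr);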
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +ulint +row_upd_sec_index_entry( +/*====================*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ibool check_ref; + ibool found; + dict_index_t* index; + dtuple_t* entry; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + mem_heap_t* heap; + rec_t* rec; + ulint err = DB_SUCCESS; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + + index = node->index; + + check_ref = row_upd_index_is_referenced(index, trx); + + heap = mem_heap_create(1024); + + /* Build old index entry */ + entry = row_build_index_entry(node->row, node->ext, index, heap); + ut_a(entry); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, + &mtr); + btr_cur = btr_pcur_get_btr_cur(&pcur); + + rec = btr_cur_get_rec(btr_cur); + + if (UNIV_UNLIKELY(!found)) { + fputs("InnoDB: error in sec index entry update in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + + trx_print(stderr, trx, 0); + + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } else { + /* Delete mark the old index record; it can already be + delete marked if we return after a lock wait in + row_ins_index_entry below */ + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(index->table))) { + err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, + thr, &mtr); + if (err == DB_SUCCESS && check_ref) { + + ulint* offsets = rec_get_offsets( + rec, index, NULL, + ULINT_UNDEFINED, &heap); + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + } + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->is_delete || err != DB_SUCCESS) { + + goto func_exit; + } + + /* Build a new index entry */ + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + /* Insert new index entry */ + err = row_ins_index_entry(index, entry, 0, TRUE, thr); + +func_exit: + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Updates the secondary index record if it is changed in the row update or +deletes it if this is a delete. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +UNIV_INLINE +ulint +row_upd_sec_step( +/*=============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC) + || (node->state == UPD_NODE_UPDATE_SOME_SEC)); + ut_ad(!dict_index_is_clust(node->index)); + + if (node->state == UPD_NODE_UPDATE_ALL_SEC + || row_upd_changes_ord_field_binary(node->row, node->index, + node->update)) { + return(row_upd_sec_index_entry(node, thr)); + } + + return(DB_SUCCESS); +} + +/***********************************************************//** +Marks the clustered index record deleted and inserts the updated version +of the record to the index. This function should be used when the ordering +fields of the clustered index record change. This should be quite rare in +database applications. 
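+The old record is delete marked, its non-updated externally stored
+fields are marked as inherited so that purge will not free them, and the
+updated version is inserted with row_ins_index_entry().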
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +ulint +row_upd_clust_rec_by_insert( +/*========================*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index of the record */ + que_thr_t* thr, /*!< in: query thread */ + ibool check_ref,/*!< in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /*!< in: mtr; gets committed here */ +{ + mem_heap_t* heap = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + dtuple_t* entry; + ulint err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + if (node->state != UPD_NODE_INSERT_CLUSTERED) { + rec_t* rec; + dict_index_t* index; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + return(err); + } + + /* Mark as not-owned the externally stored fields which the new + row inherits from the delete marked record: purge should not + free those externally stored fields even if the delete marked + record is removed from the index tree, or updated. */ + + rec = btr_cur_get_rec(btr_cur); + index = dict_table_get_first_index(table); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + btr_cur_mark_extern_inherited_fields( + btr_cur_get_page_zip(btr_cur), + rec, index, offsets, node->update, mtr); + if (check_ref) { + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, pcur, table, index, offsets, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); + } + } + } + + mtr_commit(mtr); + + if (!heap) { + heap = mem_heap_create(500); + } + node->state = UPD_NODE_INSERT_CLUSTERED; + + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); + + if (node->upd_ext) { + /* If we return from a lock wait, for example, we may have + extern fields marked as not-owned in entry (marked in the + if-branch above). We must unmark them. */ + + btr_cur_unmark_dtuple_extern_fields(entry); + + /* We must mark non-updated extern fields in entry as + inherited, so that a possible rollback will not free them. */ + + btr_cur_mark_dtuple_inherited_extern(entry, node->update); + } + + err = row_ins_index_entry(index, entry, + node->upd_ext ? node->upd_ext->n_ext : 0, + TRUE, thr); + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Updates a clustered index record of a row when the ordering fields do +not change. 
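+The update is first attempted within the page (update in place, or an
+optimistic update); only if that fails is a pessimistic update performed.
+A rough sketch of the control flow (not the literal code):
+
+	err = btr_cur_optimistic_update(...);
+	if (err != DB_SUCCESS) {
+		btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr);
+		err = btr_cur_pessimistic_update(...);
+	}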
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +ulint +row_upd_clust_rec( +/*==============*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; gets committed here */ +{ + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } + + mtr_commit(mtr); + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + + return(DB_SUCCESS); + } + + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr_start(mtr); + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, + &heap, &big_rec, node->update, + node->cmpl_info, thr, mtr); + mtr_commit(mtr); + + if (err == DB_SUCCESS && big_rec) { + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec; + rec_offs_init(offsets_); + + mtr_start(mtr); + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + rec = btr_cur_get_rec(btr_cur); + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(btr_cur), rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + big_rec, mtr); + mtr_commit(mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + + return(err); +} + +/***********************************************************//** +Delete marks a clustered index record. 
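+The row is first stored to the update node, because the secondary index
+entries still have to be built from it for the delete.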
+@return DB_SUCCESS if operation successfully completed, else error code */ +static +ulint +row_upd_del_mark_clust_rec( +/*=======================*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in/out: rec_get_offsets() for the + record under the cursor */ + que_thr_t* thr, /*!< in: query thread */ + ibool check_ref,/*!< in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /*!< in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + ut_ad(node->is_delete); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Store row because we have to build also the secondary index + entries */ + + row_upd_store_row(node); + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err == DB_SUCCESS && check_ref) { + /* NOTE that the following call loses the position of pcur ! */ + + err = row_upd_check_references_constraints(node, + pcur, index->table, + index, offsets, + thr, mtr); + } + + mtr_commit(mtr); + + return(err); +} + +/***********************************************************//** +Updates the clustered index record. +@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT +in case of a lock wait, else error code */ +static +ulint +row_upd_clust_step( +/*===============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + ibool success; + ibool check_ref; + ulint err; + mtr_t* mtr; + mtr_t mtr_buf; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr)); + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + mtr = &mtr_buf; + + mtr_start(mtr); + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. 
*/ + + ut_a(pcur->rel_pos == BTR_PCUR_ON); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + + if (!success) { + err = DB_RECORD_NOT_FOUND; + + mtr_commit(mtr); + + return(err); + } + + /* If this is a row in SYS_INDEXES table of the data dictionary, + then we have to free the file segments of the index tree associated + with the index */ + + if (node->is_delete + && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + + dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, + mtr); + if (!success) { + err = DB_ERROR; + + mtr_commit(mtr); + + return(err); + } + } + + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (!node->has_clust_rec_x_lock) { + err = lock_clust_rec_modify_check_and_lock( + 0, btr_pcur_get_block(pcur), + rec, index, offsets, thr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + goto exit_func; + } + } + + /* NOTE: the following function calls will also commit mtr */ + + if (node->is_delete) { + err = row_upd_del_mark_clust_rec(node, index, offsets, + thr, check_ref, mtr); + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; + node->index = dict_table_get_next_index(index); + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); + } + + /* If the update is made for MySQL, we already have the update vector + ready, else we have to do some evaluation: */ + + if (UNIV_UNLIKELY(!node->in_mysql_interface)) { + /* Copy the necessary columns from clust_rec and calculate the + new values to set */ + row_upd_copy_columns(rec, offsets, + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + + err = row_upd_clust_rec(node, index, thr, mtr); + return(err); + } + + row_upd_store_row(node); + + if (row_upd_changes_ord_field_binary(node->row, index, node->update)) { + + /* Update causes an ordering field (ordering fields within + the B-tree) of the clustered index record to change: perform + the update by delete marking and inserting. + + TODO! What to do to the 'Halloween problem', where an update + moves the record forward in index so that it is again + updated when the cursor arrives there? Solution: the + read operation must check the undo record undo number when + choosing records to update. MySQL solves now the problem + externally! */ + + err = row_upd_clust_rec_by_insert(node, index, thr, check_ref, + mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + node->state = UPD_NODE_UPDATE_ALL_SEC; + } else { + err = row_upd_clust_rec(node, index, thr, mtr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->state = UPD_NODE_UPDATE_SOME_SEC; + } + + node->index = dict_table_get_next_index(index); + + return(err); +} + +/***********************************************************//** +Updates the affected index records of a row. When the control is transferred +to this node, we assume that we have a persistent cursor which was on a +record, and the position of the cursor is stored in the cursor. 
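+(That is, btr_pcur_store_position() has been called on node->pcur.)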
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +ulint +row_upd( +/*====*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ulint err = DB_SUCCESS; + + ut_ad(node && thr); + + if (UNIV_LIKELY(node->in_mysql_interface)) { + + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (node->is_delete + || row_upd_changes_some_index_ord_field_binary( + node->table, node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + if (node->state == UPD_NODE_UPDATE_CLUSTERED + || node->state == UPD_NODE_INSERT_CLUSTERED) { + + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + } + + if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + + goto function_exit; + } + + while (node->index != NULL) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->index = dict_table_get_next_index(node->index); + } + +function_exit: + if (err == DB_SUCCESS) { + /* Do some cleanup */ + + if (node->row != NULL) { + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + mem_heap_empty(node->heap); + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + } + + return(err); +} + +/***********************************************************//** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + ulint err = DB_SUCCESS; + trx_t* trx; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started(trx); + + node = thr->run_node; + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + return(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = 
UPD_NODE_UPDATE_CLUSTERED; + + return(thr); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c new file mode 100644 index 00000000000..a4fbb5289aa --- /dev/null +++ b/storage/innobase/row/row0vers.c @@ -0,0 +1,741 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0vers.c +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" + +#ifdef UNIV_NONINL +#include "row0vers.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "lock0lock.h" + +/*****************************************************************//** +Finds out if an active transaction has inserted or modified a secondary +index record. NOTE: the kernel mutex is temporarily released in this +function! +@return NULL if committed, else the active transaction */ +UNIV_INTERN +trx_t* +row_vers_impl_x_locked_off_kernel( +/*==============================*/ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: the secondary index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + ulint* clust_offsets; + rec_t* version; + trx_id_t trx_id; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + dtuple_t* entry = NULL; /* assignment to eliminate compiler + warning */ + trx_t* trx; + ulint rec_del; + ulint err; + mtr_t mtr; + ulint comp; + + ut_ad(mutex_own(&kernel_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Search for the clustered index record: this is a time-consuming + operation: therefore we release the kernel mutex; also, the release + is required by the latching order convention. The latch on the + clustered index locks the top of the stack of versions. We also + reserve purge_latch to lock the bottom of the version stack. 
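+ Here the 'top' of the stack means the newest version, the record
+ on the index page itself, and the 'bottom' means the oldest version
+ still needed; holding purge_sys->latch keeps purge from removing
+ the undo log records from which the old versions are rebuilt.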
*/
+
+ clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+ &clust_index, &mtr);
+ if (!clust_rec) {
+ /* In a rare case it is possible that no clust rec is found
+ for a secondary index record: if in row0umod.c
+ row_undo_mod_remove_clust_low() we have already removed the
+ clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ mutex_enter(&kernel_mutex);
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+
+ mtr_s_lock(&(purge_sys->latch), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ trx = NULL;
+ if (!trx_is_active(trx_id)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active: no implicit lock on rec */
+ goto exit_func;
+ }
+
+ if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
+ clust_offsets, TRUE)) {
+ /* Corruption noticed: try to avoid a crash by returning */
+ goto exit_func;
+ }
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(index->table == clust_index->table);
+ ut_ad(!!comp == dict_table_is_comp(index->table));
+ ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+ /* We check whether some earlier version of the clustered index
+ record, modified by the trx_id transaction, would require rec to be
+ in a different state (delete marked or unmarked, or have different
+ field values, or not existing). If there is such a version, then rec
+ was modified by the trx_id transaction, and it has an implicit x-lock
+ on rec. Note that if clust_rec itself would require rec to be in a
+ different state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+ rec_del = rec_get_deleted_flag(rec, comp);
+ trx = NULL;
+
+ version = clust_rec;
+
+ for (;;) {
+ rec_t* prev_version;
+ ulint vers_del;
+ row_ext_t* ext;
+ trx_id_t prev_trx_id;
+
+ mutex_exit(&kernel_mutex);
+
+ /* While we retrieve an earlier version of clust_rec, we
+ release the kernel mutex, because it may take time to access
+ the disk. After the release, we have to check if the trx_id
+ transaction is still active. We keep in mtr the latch on the
+ clust_rec page, so that no other transaction can update
+ clust_rec and acquire an implicit x-lock on rec. */
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (prev_version == NULL) {
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no
+ implicit x-lock */
+
+ break;
+ }
+
+ /* If the transaction is still active,
+ clust_rec must be a fresh insert, because no
+ previous version was found.
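+ (An insert starts a new version chain: there is no update
+ undo log record from which an earlier version could be
+ built, which is why no previous version was returned.)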
*/
+ ut_ad(err == DB_SUCCESS);
+
+ /* It was a freshly inserted version: there is an
+ implicit x-lock on rec */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ /* If the trx_id and prev_trx_id are different and if
+ the prev_version is marked deleted then the
+ prev_trx_id must have already committed for the trx_id
+ to be able to modify the row. Therefore, prev_trx_id
+ cannot hold any implicit lock. */
+ if (vers_del && 0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+
+ mutex_enter(&kernel_mutex);
+ break;
+ }
+
+ /* The stack of versions is locked by mtr. Thus, it
+ is safe to fetch the prefixes for externally stored
+ columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+ clust_offsets, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+ /* entry may be NULL if a record was inserted in place
+ of a deleted record, and the BLOB pointers of the new
+ record were not initialized yet. But in that case,
+ prev_version should be NULL. */
+ ut_a(entry);
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no implicit x-lock */
+
+ break;
+ }
+
+ /* If we get here, we know that the trx_id transaction is
+ still active and it has modified prev_version. Let us check
+ if prev_version would require rec to be in a different
+ state. */
+
+ /* The previous version of clust_rec must be
+ accessible, because the transaction is still active
+ and clust_rec was not a fresh insert. */
+ ut_ad(err == DB_SUCCESS);
+
+ /* We check whether entry and rec are identical in the
+ alphabetical ordering */
+ if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ /* It is possible that the row was updated so that the
+ secondary index record remained the same in the
+ alphabetical ordering, but the field values still
+ changed. For example, 'abc' -> 'ABC'. Check for that
+ as well. */
+
+ dtuple_set_types_binary(entry,
+ dtuple_get_n_fields(entry));
+ if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+ /* The versions modified by the trx_id transaction end
+ at prev_version: no implicit x-lock */
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+exit_func:
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(trx);
+}
+
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
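+In other words, as long as some read view that may still need the
+delete marked version can exist, purge must leave the version in place.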
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr) /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ if (trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* A purge operation is not yet allowed to remove this
+ delete marked record */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+whether there is any non-delete-marked version of the record whose trx
+id >= the purge view and whose secondary index entry is identical to
+ientry in the alphabetical ordering; exactly in this case we return
+TRUE.
+@return TRUE if earlier version should have the secondary index entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if rec is also included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry) /*!< in: the secondary index entry */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ ulint* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ const dtuple_t* row;
+ const dtuple_t* entry;
+ ulint err;
+ ulint comp;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(!dict_table_is_comp(index->table) == !comp);
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record to the clustered index
+ (2) store the BLOBs and update the BLOB pointers
+ (3) insert the records to the secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records.
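+ That is, between steps (1) and (3) the new row has no secondary
+ index entries at all, so the secondary index record that the purge
+ is considering cannot belong to the freshly inserted record.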
*/
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the row may currently be being modified
+ so that the clustered index record has already been
+ updated to a different binary value in a char field,
+ but the collation still identifies the old and new
+ values as equal! */
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (err != DB_SUCCESS || !prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers. This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the secondary index record may already
+ have been updated to a different binary value in
+ a char field, but the collation still identifies the
+ old and new values as equal! */
+
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
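+If the undo log needed for rebuilding a sufficiently old version has
+already been purged, DB_MISSING_HISTORY is returned and no old version
+is built.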
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/*!< out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!read_view_sees_trx_id(view, trx_id));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* heap2 = heap;
+ trx_undo_rec_t* undo_rec;
+ roll_ptr_t roll_ptr;
+ undo_no_t undo_no;
+ heap = mem_heap_create(1024);
+
+ /* If we have a high-granularity consistent read view and
+ the creating transaction of the view is the same as trx_id
+ in the record, we see this record only when the undo_no of
+ the record is < the undo_no in the view.
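+ (The undo_no counts the modifications made by the transaction,
+ so comparing it with the undo_no stored in the view hides the
+ changes the transaction made after the view was created.)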
*/
+
+ if (view->type == VIEW_HIGH_GRANULARITY
+ && ut_dulint_cmp(view->creator_trx_id, trx_id) == 0) {
+
+ roll_ptr = row_get_rec_roll_ptr(version, index,
+ *offsets);
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ mem_heap_empty(heap);
+
+ if (ut_dulint_cmp(view->undo_no, undo_no) > 0) {
+ /* The view already sees this version: we can
+ copy it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap,
+ rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index,
+ *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+ }
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ *offsets = rec_get_offsets(prev_version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (read_view_sees_trx_id(view, trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers)/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+ trx_id_t rec_trx_id = ut_dulint_zero;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ /* The S-latch on purge_sys prevents the purge view from
+ changing. Thus, if we have an uncommitted transaction at
+ this point, then purge cannot remove its undo log even if
+ the transaction could commit now.
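+ The loop below can therefore call trx_undo_prev_version_build()
+ for any version in the chain without the needed undo log records
+ disappearing under it.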
*/
+
+ version = rec;
+
+ for (;;) {
+ trx_t* version_trx;
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ mutex_enter(&kernel_mutex);
+ version_trx = trx_get_on_id(version_trx_id);
+ mutex_exit(&kernel_mutex);
+
+ if (!version_trx
+ || version_trx->conc_state == TRX_NOT_STARTED
+ || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+
+ /* We found a version that belongs to a
+ committed transaction: return it. */
+
+ if (rec == version) {
+ *old_vers = rec;
+ err = DB_SUCCESS;
+ break;
+ }
+
+ /* We assume that a rolled-back transaction stays in
+ TRX_ACTIVE state until all the changes have been
+ rolled back and the transaction is removed from
+ the global list of transactions. */
+
+ if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) {
+ /* The transaction was committed while
+ we searched for earlier versions.
+ Return the current version as a
+ semi-consistent read. */
+
+ version = rec;
+ *offsets = rec_get_offsets(version,
+ index, *offsets,
+ ULINT_UNDEFINED,
+ offset_heap);
+ }
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ *offsets = rec_get_offsets(version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+ }/* for (;;) */
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}