diff options
Diffstat (limited to 'storage/innobase')
33 files changed, 604 insertions, 197 deletions
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 9438277050d..790582815a3 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -1937,7 +1937,7 @@ btr_node_ptr_delete( ut_a(err == DB_SUCCESS); if (!compressed) { - btr_cur_compress_if_useful(&cursor, mtr); + btr_cur_compress_if_useful(&cursor, FALSE, mtr); } } @@ -1945,9 +1945,10 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. */ static -void +page_t* btr_lift_page_up( /*=============*/ + /* out: father page */ dict_index_t* index, /* in: index tree */ page_t* page, /* in: page which is the only on its level; must not be empty: use @@ -2023,6 +2024,8 @@ btr_lift_page_up( ibuf_reset_free_bits(index, father_page); ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(index, father_page, mtr)); + + return(father_page); } /***************************************************************** @@ -2039,11 +2042,13 @@ enough free extents so that the compression will always succeed if done! 
*/ void btr_compress( /*=========*/ - btr_cur_t* cursor, /* in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr) /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /* in/out: mini-transaction */ { dict_index_t* index; ulint space; @@ -2058,6 +2063,7 @@ btr_compress( rec_t* node_ptr; ulint data_size; ulint n_recs; + ulint nth_rec = 0; /* remove bogus warning */ ulint max_ins_size; ulint max_ins_size_reorg; ulint comp; @@ -2065,6 +2071,7 @@ btr_compress( page = btr_cur_get_page(cursor); index = btr_cur_get_index(cursor); comp = page_is_comp(page); + ut_a((ibool)!!comp == dict_table_is_comp(index->table)); ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), @@ -2086,6 +2093,10 @@ btr_compress( father_page = buf_frame_align(node_ptr); ut_a(comp == page_is_comp(father_page)); + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + } + /* Decide the page to which we try to merge and which will inherit the locks */ @@ -2110,9 +2121,8 @@ btr_compress( } else { /* The page is the only one on the level, lift the records to the father */ - btr_lift_page_up(index, page, mtr); - - return; + merge_page = btr_lift_page_up(index, page, mtr); + goto func_exit; } n_recs = page_get_n_recs(page); @@ -2188,6 +2198,10 @@ btr_compress( index, mtr); lock_update_merge_left(merge_page, orig_pred, page); + + if (adjust) { + nth_rec += page_rec_get_n_recs_before(orig_pred); + } } else { orig_succ = page_rec_get_next( page_get_infimum_rec(merge_page)); @@ -2208,6 +2222,12 @@ btr_compress( btr_page_free(index, page, mtr); ut_ad(btr_check_node_ptr(index, merge_page, mtr)); + +func_exit: + if (adjust) { + 
btr_cur_position(index, page_rec_get_nth(merge_page, nth_rec), + cursor); + } } /***************************************************************** diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 6c0497cbd41..9ce09929f9a 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -31,6 +31,7 @@ Created 10/16/1994 Heikki Tuuri #include "btr0sea.h" #include "row0upd.h" #include "trx0rec.h" +#include "trx0roll.h" /* trx_roll_crash_recv_trx */ #include "que0que.h" #include "row0row.h" #include "srv0srv.h" @@ -73,6 +74,13 @@ this many index pages */ + not_empty) \ / (BTR_KEY_VAL_ESTIMATE_N_PAGES + ext_size)) +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/* A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /*********************************************************************** Marks all extern fields in a record as owned by the record. 
This function should be called if the delete mark of a record is removed: a not delete @@ -1572,7 +1580,6 @@ btr_cur_optimistic_update( ulint old_rec_size; dtuple_t* new_entry; dulint roll_ptr; - trx_t* trx; mem_heap_t* heap; ibool reorganized = FALSE; ulint i; @@ -1585,6 +1592,10 @@ btr_cur_optimistic_update( heap = mem_heap_create(1024); offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { @@ -1691,13 +1702,11 @@ btr_cur_optimistic_update( page_cur_move_to_prev(page_cursor); - trx = thr_get_trx(thr); - if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, roll_ptr); row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx->id); + thr_get_trx(thr)->id); } rec = btr_cur_insert_if_possible(cursor, new_entry, &reorganized, mtr); @@ -1781,7 +1790,9 @@ btr_cur_pessimistic_update( /* out: DB_SUCCESS or error code */ ulint flags, /* in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /* in: cursor on the record to update */ + btr_cur_t* cursor, /* in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ big_rec_t** big_rec,/* out: big rec vector whose fields have to be stored externally by the caller, or NULL */ upd_t* update, /* in: update vector; this is allowed also @@ -1916,6 +1927,10 @@ btr_cur_pessimistic_update( err = DB_TOO_BIG_RECORD; goto return_after_reservations; } + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(btr_page_get_level(page, mtr) == 0); + ut_ad(flags & BTR_KEEP_POS_FLAG); } page_cursor = btr_cur_get_page_cur(cursor); @@ -1942,6 +1957,8 @@ btr_cur_pessimistic_update( ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { + page_cursor->rec 
= rec; + lock_rec_restore_from_page_infimum(rec, page); rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); @@ -1955,12 +1972,30 @@ btr_cur_pessimistic_update( btr_cur_unmark_extern_fields(rec, mtr, offsets); } - btr_cur_compress_if_useful(cursor, mtr); + btr_cur_compress_if_useful( + cursor, + big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG), + mtr); err = DB_SUCCESS; goto return_after_reservations; } + if (big_rec_vec) { + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(btr_page_get_level(page, mtr) == 0); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + mtr_x_lock(dict_index_get_lock(index), mtr); + } + if (page_cur_is_before_first(page_cursor)) { /* The record to be updated was positioned as the first user record on its page */ @@ -1981,6 +2016,7 @@ btr_cur_pessimistic_update( ut_a(rec); ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); + page_cursor->rec = rec; rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); @@ -2015,6 +2051,43 @@ return_after_reservations: return(err); } +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ + +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr) /* in/out: mini-transaction */ +{ + buf_block_t* block; + + block = buf_block_align(btr_cur_get_rec(cursor)); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* Keep the locks across the mtr_commit(mtr). 
*/ + rw_lock_x_lock(dict_index_get_lock(cursor->index)); + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); +#else + buf_block_buf_fix_inc(block); +#endif + mutex_exit(&block->mutex); + /* Write out the redo log. */ + mtr_commit(mtr); + mtr_start(mtr); + /* Reassociate the locks with the mini-transaction. + They will be released on mtr_commit(mtr). */ + mtr_memo_push(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); +} + /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /******************************************************************** @@ -2383,30 +2456,6 @@ btr_cur_del_unmark_for_ibuf( /*==================== B-TREE RECORD REMOVE =========================*/ /***************************************************************** -Tries to compress a page of the tree on the leaf level. It is assumed -that mtr holds an x-latch on the tree and on the cursor page. To avoid -deadlocks, mtr must also own x-latches to brothers of page, if those -brothers exist. NOTE: it is assumed that the caller has reserved enough -free extents so that the compression will always succeed if done! */ - -void -btr_cur_compress( -/*=============*/ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid */ - mtr_t* mtr) /* in: mtr */ -{ - ut_ad(mtr_memo_contains(mtr, - dict_index_get_lock(btr_cur_get_index(cursor)), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_rec(cursor)), - MTR_MEMO_PAGE_X_FIX)); - ut_ad(btr_page_get_level(btr_cur_get_page(cursor), mtr) == 0); - - btr_compress(cursor, mtr); -} - -/***************************************************************** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. 
To avoid deadlocks, mtr must also own x-latches to brothers of page, if those @@ -2417,10 +2466,12 @@ ibool btr_cur_compress_if_useful( /*=======================*/ /* out: TRUE if compression occurred */ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid if compression - occurs */ - mtr_t* mtr) /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /* in/out: mini-transaction */ { ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(btr_cur_get_index(cursor)), @@ -2430,7 +2481,7 @@ btr_cur_compress_if_useful( if (btr_cur_compress_recommendation(cursor, mtr)) { - btr_compress(cursor, mtr); + btr_compress(cursor, adjust, mtr); return(TRUE); } @@ -2643,7 +2694,7 @@ return_after_reservations: mem_heap_free(heap); if (ret == FALSE) { - ret = btr_cur_compress_if_useful(cursor, mtr); + ret = btr_cur_compress_if_useful(cursor, FALSE, mtr); } if (n_extents > 0) { diff --git a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.c index c505bfbd6c4..7e820cfb08d 100644 --- a/storage/innobase/dict/dict0load.c +++ b/storage/innobase/dict/dict0load.c @@ -454,9 +454,11 @@ dict_load_report_deleted_index( /************************************************************************ Loads definitions for index fields. 
*/ static -void +ulint dict_load_fields( /*=============*/ + /* out: DB_SUCCESS if ok, DB_CORRUPTION + if failed */ dict_table_t* table, /* in: table */ dict_index_t* index, /* in: index whose fields to load */ mem_heap_t* heap) /* in: memory heap for temporary storage */ @@ -474,6 +476,7 @@ dict_load_fields( byte* buf; ulint i; mtr_t mtr; + ulint error = DB_SUCCESS; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -535,6 +538,26 @@ dict_load_fields( field = rec_get_nth_field_old(rec, 4, &len); + if (prefix_len >= DICT_MAX_INDEX_COL_LEN) { + fprintf(stderr, "InnoDB: Error: load index" + " '%s' failed.\n" + "InnoDB: index field '%s' has a prefix" + " length of %lu bytes,\n" + "InnoDB: which exceeds the" + " maximum limit of %lu bytes.\n" + "InnoDB: Please use server that" + " supports long index prefix\n" + "InnoDB: or turn on" + " innodb_force_recovery to load" + " the table\n", + index->name, mem_heap_strdupl( + heap, (char*) field, len), + (ulong) prefix_len, + (ulong) (DICT_MAX_INDEX_COL_LEN - 1)); + error = DB_CORRUPTION; + goto func_exit; + } + dict_mem_index_add_field(index, mem_heap_strdupl(heap, (char*) field, len), @@ -543,8 +566,10 @@ dict_load_fields( btr_pcur_move_to_next_user_rec(&pcur, &mtr); } +func_exit: btr_pcur_close(&pcur); mtr_commit(&mtr); + return(error); } /************************************************************************ @@ -701,10 +726,28 @@ dict_load_indexes( space, type, n_fields); index->id = id; - dict_load_fields(table, index, heap); + error = dict_load_fields(table, index, heap); + + if (error != DB_SUCCESS) { + fprintf(stderr, "InnoDB: Error: load index '%s'" + " for table '%s' failed\n", + index->name, table->name); + + /* If the force recovery flag is set, and + if the failed index is not the primary index, we + will continue and open other indexes */ + if (srv_force_recovery + && !(index->type & DICT_CLUSTERED)) { + error = DB_SUCCESS; + goto next_rec; + } else { + goto func_exit; + } + } + dict_index_add_to_cache(table, index, 
page_no); } - +next_rec: btr_pcur_move_to_next_user_rec(&pcur, &mtr); } @@ -881,9 +924,18 @@ err_exit: } else { table->fk_max_recusive_level = 0; } - } else if (!srv_force_recovery) { - dict_table_remove_from_cache(table); - table = NULL; + } else { + dict_index_t* index; + + /* Make sure that at least the clustered index was loaded. + Otherwise refuse to load the table */ + index = dict_table_get_first_index(table); + + if (!srv_force_recovery || !index + || !(index->type & DICT_CLUSTERED)) { + dict_table_remove_from_cache(table); + table = NULL; + } } #if 0 if (err != DB_SUCCESS && table != NULL) { diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 6f58fd70fbd..dfe13ccbbfe 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -189,7 +189,7 @@ innobase_index_name_is_reserved( /*============================*/ /* out: true if index name matches a reserved name */ - const trx_t* trx, /* in: InnoDB transaction handle */ + THD* thd, /* in/out: MySQL connection */ const TABLE* form, /* in: information on table columns and indexes */ const char* norm_name); /* in: table name */ @@ -5285,10 +5285,6 @@ create_table_def( DBUG_PRINT("enter", ("table_name: %s", table_name)); ut_a(trx->mysql_thd != NULL); - if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name, - (THD*) trx->mysql_thd)) { - DBUG_RETURN(HA_ERR_GENERIC); - } n_cols = form->s->fields; @@ -5397,6 +5393,8 @@ err_col: col_len); } + srv_lower_case_table_names = lower_case_table_names; + error = row_create_table_for_mysql(table, trx); innodb_check_for_record_too_big_error(flags & DICT_TF_COMPACT, error); @@ -5642,6 +5640,35 @@ ha_innobase::create( DBUG_RETURN(HA_ERR_TO_BIG_ROW); } + strcpy(name2, name); + + normalize_table_name(norm_name, name2); + + /* Create the table definition in InnoDB */ + + flags = form->s->row_type != ROW_TYPE_REDUNDANT ? 
DICT_TF_COMPACT : 0; + + /* Look for a primary key */ + + primary_key_no= (form->s->primary_key != MAX_KEY ? + (int) form->s->primary_key : + -1); + + /* Our function row_get_mysql_key_number_for_index assumes + the primary key is always number 0, if it exists */ + + DBUG_ASSERT(primary_key_no == -1 || primary_key_no == 0); + + /* Check for name conflicts (with reserved name) for + any user indices to be created. */ + if (innobase_index_name_is_reserved(thd, form, norm_name)) { + DBUG_RETURN(-1); + } + + if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) { + DBUG_RETURN(HA_ERR_GENERIC); + } + /* Get the transaction associated with the current thd, or create one if not yet created */ @@ -5665,48 +5692,12 @@ ha_innobase::create( trx->check_unique_secondary = FALSE; } - if (lower_case_table_names) { - srv_lower_case_table_names = TRUE; - } else { - srv_lower_case_table_names = FALSE; - } - - strcpy(name2, name); - - normalize_table_name(norm_name, name2); - /* Latch the InnoDB data dictionary exclusively so that no deadlocks or lock waits can happen in it during a table create operation. Drop table etc. do this latching in row0mysql.c. */ row_mysql_lock_data_dictionary(trx); - /* Create the table definition in InnoDB */ - - flags = 0; - - if (form->s->row_type != ROW_TYPE_REDUNDANT) { - flags |= DICT_TF_COMPACT; - } - - /* Look for a primary key */ - - primary_key_no= (form->s->primary_key != MAX_KEY ? - (int) form->s->primary_key : - -1); - - /* Our function row_get_mysql_key_number_for_index assumes - the primary key is always number 0, if it exists */ - - DBUG_ASSERT(primary_key_no == -1 || primary_key_no == 0); - - /* Check for name conflicts (with reserved name) for - any user indices to be created. */ - if (innobase_index_name_is_reserved(trx, form, norm_name)) { - error = -1; - goto cleanup; - } - error = create_table_def(trx, form, norm_name, create_info->options & HA_LEX_CREATE_TMP_TABLE ? 
name2 : NULL, flags); @@ -5936,12 +5927,6 @@ ha_innobase::delete_table( trx_search_latch_release_if_reserved(parent_trx); - if (lower_case_table_names) { - srv_lower_case_table_names = TRUE; - } else { - srv_lower_case_table_names = FALSE; - } - trx = trx_allocate_for_mysql(); trx->mysql_thd = thd; @@ -5961,6 +5946,8 @@ ha_innobase::delete_table( /* Drop the table in InnoDB */ + srv_lower_case_table_names = lower_case_table_names; + error = row_drop_table_for_mysql(norm_name, trx, thd_sql_command(thd) == SQLCOM_DROP_DB); @@ -6089,12 +6076,6 @@ ha_innobase::rename_table( trx_search_latch_release_if_reserved(parent_trx); - if (lower_case_table_names) { - srv_lower_case_table_names = TRUE; - } else { - srv_lower_case_table_names = FALSE; - } - trx = trx_allocate_for_mysql(); trx->mysql_thd = thd; INNOBASE_COPY_STMT(thd, trx); @@ -6114,6 +6095,8 @@ ha_innobase::rename_table( /* Rename the table in InnoDB */ + srv_lower_case_table_names = lower_case_table_names; + error = row_rename_table_for_mysql(norm_from, norm_to, trx); /* Flush the log to reduce probability that the .frm files and @@ -7342,10 +7325,18 @@ ha_innobase::external_lock( reset_template(prebuilt); - if (lock_type == F_WRLCK) { + if (lock_type == F_WRLCK + || (table->s->tmp_table + && thd_sql_command(thd) == SQLCOM_LOCK_TABLES)) { /* If this is a SELECT, then it is in UPDATE TABLE ... - or SELECT ... FOR UPDATE */ + or SELECT ... FOR UPDATE + + For temporary tables which are locked for READ by LOCK TABLES + updates are still allowed by SQL-layer. In order to accommodate + for such a situation we always request X-lock for such table + at LOCK TABLES time. 
+ */ prebuilt->select_lock_type = LOCK_X; prebuilt->stored_select_lock_type = LOCK_X; } @@ -8565,7 +8556,7 @@ innobase_commit_by_xid( if (trx) { innobase_commit_low(trx); - + trx_free_for_background(trx); return(XA_OK); } else { return(XAER_NOTA); @@ -8588,7 +8579,9 @@ innobase_rollback_by_xid( trx = trx_get_trx_by_xid(xid); if (trx) { - return(innobase_rollback_trx(trx)); + int ret = innobase_rollback_trx(trx); + trx_free_for_background(trx); + return(ret); } else { return(XAER_NOTA); } @@ -8824,7 +8817,7 @@ innobase_index_name_is_reserved( /*============================*/ /* out: true if an index name matches the reserved name */ - const trx_t* trx, /* in: InnoDB transaction handle */ + THD* thd, /* in/out: MySQL connection */ const TABLE* form, /* in: information on table columns and indexes */ const char* norm_name) /* in: table name */ @@ -8838,7 +8831,7 @@ innobase_index_name_is_reserved( if (innobase_strcasecmp(key->name, innobase_index_reserve_name) == 0) { /* Push warning to mysql */ - push_warning_printf((THD*) trx->mysql_thd, + push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_CANT_CREATE_TABLE, "Cannot Create Index with name " diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 1573de7e818..269fa355558 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -312,11 +312,13 @@ enough free extents so that the compression will always succeed if done! 
*/ void btr_compress( /*=========*/ - btr_cur_t* cursor, /* in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr); /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr); /* in/out: mini-transaction */ /***************************************************************** Discards a page from a B-tree. This is used to remove the last record from a B-tree page: the whole page must be removed at the same time. This cannot diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 20235c55f22..c068d8d3318 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -23,6 +23,9 @@ Created 10/16/1994 Heikki Tuuri #define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */ #define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the update vector or inserted entry */ +#define BTR_KEEP_POS_FLAG 8 /* btr_cur_pessimistic_update() + must keep cursor position when + moving columns to big_rec */ #define BTR_CUR_ADAPT #define BTR_CUR_HASH_ADAPT @@ -237,7 +240,9 @@ btr_cur_pessimistic_update( /* out: DB_SUCCESS or error code */ ulint flags, /* in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /* in: cursor on the record to update */ + btr_cur_t* cursor, /* in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ big_rec_t** big_rec,/* out: big rec vector whose fields have to be stored externally by the caller, or NULL */ upd_t* update, /* in: update vector; this is allowed also @@ -247,6 +252,15 @@ btr_cur_pessimistic_update( updates */ que_thr_t* thr, /* in: query thread */ 
mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ + +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr); /* in/out: mini-transaction */ /*************************************************************** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -286,19 +300,6 @@ btr_cur_del_unmark_for_ibuf( rec_t* rec, /* in: record to delete unmark */ mtr_t* mtr); /* in: mtr */ /***************************************************************** -Tries to compress a page of the tree on the leaf level. It is assumed -that mtr holds an x-latch on the tree and on the cursor page. To avoid -deadlocks, mtr must also own x-latches to brothers of page, if those -brothers exist. NOTE: it is assumed that the caller has reserved enough -free extents so that the compression will always succeed if done! */ - -void -btr_cur_compress( -/*=============*/ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid */ - mtr_t* mtr); /* in: mtr */ -/***************************************************************** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. 
To avoid deadlocks, mtr must also own x-latches to brothers of page, if those @@ -309,10 +310,12 @@ ibool btr_cur_compress_if_useful( /*=======================*/ /* out: TRUE if compression occurred */ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid if compression - occurs */ - mtr_t* mtr); /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr); /* in/out: mini-transaction */ /*********************************************************** Removes the record on which the tree cursor is positioned. It is assumed that the mtr has an x-latch on the page where the cursor is positioned, diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index 8fa0bf0602d..eaa1f36e781 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -18,4 +18,9 @@ typedef struct btr_pcur_struct btr_pcur_t; typedef struct btr_cur_struct btr_cur_t; typedef struct btr_search_struct btr_search_t; +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#define BTR_EXTERN_FIELD_REF_SIZE 20 +extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + #endif diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 3e8972d9182..7479ce9cbf0 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -682,6 +682,25 @@ buf_page_address_fold( /* out: the folded value */ ulint space, /* in: space id */ ulint offset);/* in: offset of the page within space */ +#ifdef UNIV_SYNC_DEBUG +/*********************************************************************** +Increments the bufferfix count. 
*/ +UNIV_INLINE +void +buf_block_buf_fix_inc_debug( +/*========================*/ + buf_block_t* block, /* in: block to bufferfix */ + const char* file __attribute__ ((unused)), /* in: file name */ + ulint line __attribute__ ((unused))); /* in: line */ +#else /* UNIV_SYNC_DEBUG */ +/*********************************************************************** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc( +/*==================*/ + buf_block_t* block); /* in: block to bufferfix */ +#endif /* UNIV_SYNC_DEBUG */ /********************************************************************** Returns the control block of a file page, NULL if not found. */ UNIV_INLINE diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 58c5fd9ef3d..f4d3619f73f 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -660,6 +660,6 @@ buf_page_dbg_add_level( ulint level __attribute__((unused))) /* in: latching order level */ { - sync_thread_add_level(&(buf_block_align(frame)->lock), level); + sync_thread_add_level(&(buf_block_align(frame)->lock), level, FALSE); } #endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 273007c2778..24698557e77 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -234,11 +234,22 @@ page_get_supremum_rec( /*==================*/ /* out: the last record in record list */ page_t* page); /* in: page which must have record(s) */ -/**************************************************************** -Returns the middle record of record list. If there are an even number -of records in the list, returns the first record of upper half-list. */ +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). 
*/ rec_t* +page_rec_get_nth( +/*=============*/ + /* out: nth record */ + page_t* page, /* in: page */ + ulint nth); /* in: nth record */ +/***************************************************************** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. */ +UNIV_INLINE +rec_t* page_get_middle_rec( /*================*/ /* out: middle record */ @@ -280,7 +291,8 @@ page_get_n_recs( page_t* page); /* in: index page */ /******************************************************************* Returns the number of records before the given record in chain. -The number includes infimum and supremum records. */ +The number includes infimum and supremum records. +This is the inverse function of page_rec_get_nth(). */ ulint page_rec_get_n_recs_before( diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index d9e67f3eeeb..a019aa28515 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -341,6 +341,22 @@ page_rec_is_infimum( } /***************************************************************** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + /* out: middle record */ + page_t* page) /* in: page */ +{ + ulint middle = (page_get_n_recs(page) + 2) / 2; + + return(page_rec_get_nth(page, middle)); +} + +/***************************************************************** Compares a data tuple to a physical record. 
Differs from the function cmp_dtuple_rec_with_match in the way that the record must reside on an index page, and also page infimum and supremum records can be given in diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 58762fc3111..67baeb7d8d2 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -339,6 +339,19 @@ rec_offs_any_extern( /*================*/ /* out: TRUE if a field is stored externally */ const ulint* offsets);/* in: array returned by rec_get_offsets() */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************** +Determine if the offsets are for a record containing null BLOB pointers. */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + /* out: first field containing + a null BLOB pointer, + or NULL if none found */ + rec_t* rec, /*!< in: record */ + const ulint* offsets); /*!< in: rec_get_offsets(rec) */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /*************************************************************** Sets the value of the ith field extern storage bit. */ UNIV_INLINE diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index df66bb13aeb..566c62e30f2 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -9,6 +9,7 @@ Created 5/30/1994 Heikki Tuuri #include "mach0data.h" #include "ut0byte.h" #include "dict0dict.h" +#include "btr0types.h" /* Compact flag ORed to the extra size returned by rec_get_offsets() */ #define REC_OFFS_COMPACT ((ulint) 1 << 31) @@ -1020,6 +1021,42 @@ rec_offs_any_extern( return(FALSE); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************** +Determine if the offsets are for a record containing null BLOB pointers. 
*/ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + /* out: first field containing + a null BLOB pointer, + or NULL if none found */ + rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ +{ + ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field + = rec_get_nth_field(rec, offsets, i, &len); + + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + if (!memcmp(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + return(field); + } + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /*************************************************************** Sets the value of the ith field extern storage bit. */ UNIV_INLINE diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h index fae26b7a63e..ec48059dbcb 100644 --- a/storage/innobase/include/sync0arr.h +++ b/storage/innobase/include/sync0arr.h @@ -93,10 +93,13 @@ sync_arr_wake_threads_if_sema_free(void); Prints warnings of long semaphore waits to stderr. */ ibool -sync_array_print_long_waits(void); -/*=============================*/ - /* out: TRUE if fatal semaphore wait threshold - was exceeded */ +sync_array_print_long_waits( +/*========================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ + os_thread_id_t* waiter, /* out: longest waiting thread */ + const void** sema) /* out: longest-waited-for semaphore */ + __attribute__((nonnull)); /************************************************************************ Validates the integrity of the wait array. Checks that the number of reserved cells equals the count variable. 
*/ diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 9430d4cb723..595dca0da6d 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -198,8 +198,9 @@ void sync_thread_add_level( /*==================*/ void* latch, /* in: pointer to a mutex or an rw-lock */ - ulint level); /* in: level in the latching order; if + ulint level, /* in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock);/* in: TRUE if re-entering an x-lock */ /********************************************************************** Removes a latch from the thread level array if it is found there. */ diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index c1eca3d5753..4fabb83b025 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -15,6 +15,9 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" +/* In crash recovery, the current trx to be rolled back */ +extern trx_t* trx_roll_crash_recv_trx; + #define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL) /*********************************************************************** diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index bad3c9d570c..7ea981eb85c 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -256,6 +256,16 @@ trx_in_trx_list( /*============*/ /* out: TRUE if is in */ trx_t* in_trx);/* in: trx */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************** +Assert that a transaction is active. 
*/ +UNIV_INLINE +ibool +trx_assert_active( +/*==============*/ + /* out: TRUE */ + dulint trx_id); /* in: transaction identifier */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /********************************************************************* Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. In a MySQL diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic index 1142fb60398..f5033c5778a 100644 --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -257,6 +257,27 @@ trx_get_on_id( return(NULL); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************** +Assert that a transaction is active. */ +UNIV_INLINE +ibool +trx_assert_active( +/*==============*/ + /* out: TRUE */ + dulint trx_id) /* in: transaction identifier */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + trx = trx_get_on_id(trx_id); + ut_a(trx); + mutex_exit(&kernel_mutex); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /******************************************************************** Returns the minumum trx id in trx list. This is the smallest id for which the trx can possibly be active. 
(But, you must look at the trx->conc_state to diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 4652f45892e..7cb16107746 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -19,7 +19,12 @@ Created 3/26/1996 Heikki Tuuri #include "dict0types.h" #include "trx0xa.h" +/* Number of transactions currently allocated for MySQL: protected by +the kernel mutex */ extern ulint trx_n_mysql_transactions; +/* Number of transactions currently in the XA PREPARED state: protected by +the kernel mutex */ +extern ulint trx_n_prepared; /************************************************************************ Releases the search latch if trx has reserved it. */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index ce5d8a092bf..a67b1b3895e 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -88,6 +88,8 @@ memory is read outside the allocated blocks. */ #if 0 #define UNIV_DEBUG_VALGRIND /* Enable extra Valgrind instrumentation */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ #define UNIV_DEBUG /* Enable ut_ad() assertions */ #define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */ #define UNIV_MEM_DEBUG /* detect memory leaks etc */ diff --git a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c index 3300997112b..092e3bfe37f 100644 --- a/storage/innobase/log/log0log.c +++ b/storage/innobase/log/log0log.c @@ -3052,12 +3052,13 @@ loop: goto loop; } - /* Check that there are no longer transactions. We need this wait even - for the 'very fast' shutdown, because the InnoDB layer may have - committed or prepared transactions and we don't want to lose them. */ + /* Check that there are no longer transactions, except for + PREPARED ones. We need this wait even for the 'very fast' + shutdown, because the InnoDB layer may have committed or + prepared transactions and we don't want to lose them. 
*/ if (trx_n_mysql_transactions > 0 - || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { + || UT_LIST_GET_LEN(trx_sys->trx_list) > trx_n_prepared) { mutex_exit(&kernel_mutex); diff --git a/storage/innobase/page/page0page.c b/storage/innobase/page/page0page.c index 543cf9e34eb..6a89df7de22 100644 --- a/storage/innobase/page/page0page.c +++ b/storage/innobase/page/page0page.c @@ -1194,49 +1194,42 @@ page_dir_balance_slot( } /**************************************************************** -Returns the middle record of the record list. If there are an even number -of records in the list, returns the first record of the upper half-list. */ +Returns the nth record of the record list. */ rec_t* -page_get_middle_rec( -/*================*/ - /* out: middle record */ - page_t* page) /* in: page */ +page_rec_get_nth( +/*=============*/ + /* out: nth record */ + page_t* page, /* in: page */ + ulint nth) /* in: nth record */ { page_dir_slot_t* slot; - ulint middle; ulint i; ulint n_owned; - ulint count; rec_t* rec; - /* This many records we must leave behind */ - middle = (page_get_n_recs(page) + 2) / 2; - - count = 0; + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); for (i = 0;; i++) { slot = page_dir_get_nth_slot(page, i); n_owned = page_dir_slot_get_n_owned(slot); - if (count + n_owned > middle) { + if (n_owned > nth) { break; } else { - count += n_owned; + nth -= n_owned; } } ut_ad(i > 0); slot = page_dir_get_nth_slot(page, i - 1); rec = page_dir_slot_get_rec(slot); - rec = page_rec_get_next(rec); - - /* There are now count records behind rec */ - for (i = 0; i < middle - count; i++) { + do { rec = page_rec_get_next(rec); - } + ut_ad(rec); + } while (nth--); return(rec); } diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c index 9786f90fd39..7ff443a11ad 100644 --- a/storage/innobase/row/row0ins.c +++ b/storage/innobase/row/row0ins.c @@ -259,6 +259,7 @@ row_ins_sec_index_entry_by_modify( err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, 
cursor, &dummy_big_rec, update, 0, thr, mtr); + ut_a(!dummy_big_rec); } func_exit: mem_heap_free(heap); @@ -329,8 +330,9 @@ row_ins_clust_index_entry_by_modify( goto func_exit; } - err = btr_cur_pessimistic_update(0, cursor, big_rec, update, - 0, thr, mtr); + err = btr_cur_pessimistic_update( + BTR_KEEP_POS_FLAG, cursor, big_rec, update, + 0, thr, mtr); } func_exit: mem_heap_free(heap); @@ -2083,6 +2085,41 @@ row_ins_index_entry_low( err = row_ins_clust_index_entry_by_modify( mode, &cursor, &big_rec, entry, ext_vec, n_ext_vec, thr, &mtr); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. We have + to mtr_commit(mtr) first, so that the + redo log will be written in the + correct order. Otherwise, we would run + into trouble on crash recovery if mtr + freed B-tree pages on which some of + the big_rec fields will be written. */ + btr_cur_mtr_commit_and_start(&cursor, &mtr); + + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, + &heap); + + err = btr_store_big_rec_extern_fields( + index, rec, offsets, big_rec, &mtr); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. 
*/ + ut_a(err == DB_SUCCESS); + goto stored_big_rec; + } } else { err = row_ins_sec_index_entry_by_modify( mode, &cursor, entry, thr, &mtr); @@ -2119,7 +2156,6 @@ function_exit: mtr_commit(&mtr); if (big_rec) { - rec_t* rec; mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, @@ -2130,7 +2166,7 @@ function_exit: err = btr_store_big_rec_extern_fields(index, rec, offsets, big_rec, &mtr); - +stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); } else { diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index 08e50817db9..171039e34ac 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -202,6 +202,7 @@ row_build( ut_ad(index && rec && heap); ut_ad(index->type & DICT_CLUSTERED); + ut_ad(!mutex_own(&kernel_mutex)); if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, @@ -210,6 +211,26 @@ row_build( ut_ad(rec_offs_validate(rec, index, offsets)); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* This condition can occur during crash recovery before + trx_rollback_or_clean_all_without_sess() has completed + execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it is active. (In this version of InnoDB, we + cannot assert that it was recovered, because there is no + trx->is_recovered field.) 
*/ + + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_assert_active(row_get_rec_trx_id(rec, index, offsets))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ buf = mem_heap_alloc(heap, rec_offs_size(offsets)); @@ -302,6 +323,10 @@ row_rec_to_index_entry( rec = rec_copy(buf, rec, offsets); /* Avoid a debug assertion in rec_offs_validate(). */ rec_offs_make_valid(rec, index, offsets); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + } else { + ut_a(!rec_offs_any_null_extern(rec, offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ } rec_len = rec_offs_n_fields(offsets); diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c index a3333fcc536..0b00aa2411a 100644 --- a/storage/innobase/row/row0umod.c +++ b/storage/innobase/row/row0umod.c @@ -119,6 +119,7 @@ row_undo_mod_clust_low( | BTR_KEEP_SYS_FLAG, btr_cur, &dummy_big_rec, node->update, node->cmpl_info, thr, mtr); + ut_ad(!dummy_big_rec); } return(err); @@ -471,6 +472,7 @@ row_undo_mod_del_unmark_sec_and_undo_update( BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, btr_cur, &dummy_big_rec, update, 0, thr, &mtr); + ut_ad(!dummy_big_rec); } mem_heap_free(heap); diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c index 0790cfe02e2..694b00ea265 100644 --- a/storage/innobase/row/row0upd.c +++ b/storage/innobase/row/row0upd.c @@ -1580,32 +1580,48 @@ row_upd_clust_rec( ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), dict_table_is_comp(index->table))); - err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, - &big_rec, node->update, - node->cmpl_info, thr, mtr); - mtr_commit(mtr); + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &big_rec, node->update, node->cmpl_info, thr, mtr); - if (err == DB_SUCCESS && big_rec) { + if (big_rec) { mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; rec_t* rec; *offsets_ = (sizeof offsets_) 
/ sizeof *offsets_; - mtr_start(mtr); + ut_a(err == DB_SUCCESS); + /* Write out the externally stored columns while still + x-latching index->lock and block->lock. We have to + mtr_commit(mtr) first, so that the redo log will be + written in the correct order. Otherwise, we would run + into trouble on crash recovery if mtr freed B-tree + pages on which some of the big_rec fields will be + written. */ + btr_cur_mtr_commit_and_start(btr_cur, mtr); - ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr); + big_rec, mtr); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - mtr_commit(mtr); + /* If writing big_rec fails (for example, because of + DB_OUT_OF_FILE_SPACE), the record will be corrupted. + Even if we did not update any externally stored + columns, our update could cause the record to grow so + that a non-updated column was selected for external + storage. This non-update would not have been written + to the undo log, and thus the record cannot be rolled + back. 
*/ + ut_a(err == DB_SUCCESS); } + mtr_commit(mtr); + if (big_rec) { dtuple_big_rec_free(big_rec); } diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c index 23aca8c3f2e..906b46fb51b 100644 --- a/storage/innobase/row/row0vers.c +++ b/storage/innobase/row/row0vers.c @@ -473,6 +473,11 @@ row_vers_build_for_consistent_read( /* The view already sees this version: we can copy it to in_heap and return */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern( + version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); *old_vers = rec_copy(buf, version, *offsets); @@ -506,6 +511,10 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets(prev_version, index, *offsets, ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + trx_id = row_get_rec_trx_id(prev_version, index, *offsets); if (read_view_sees_trx_id(view, trx_id)) { @@ -606,6 +615,10 @@ row_vers_build_for_semi_consistent_read( /* We found a version that belongs to a committed transaction: return it. 
*/ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + if (rec == version) { *old_vers = rec; err = DB_SUCCESS; @@ -663,6 +676,9 @@ row_vers_build_for_semi_consistent_read( version = prev_version; *offsets = rec_get_offsets(version, index, *offsets, ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ }/* for (;;) */ if (heap) { diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index 9c34e73109c..3f6f1982992 100644 --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -2180,9 +2180,15 @@ srv_error_monitor_thread( os_thread_create */ { /* number of successive fatal timeouts observed */ - ulint fatal_cnt = 0; - dulint old_lsn; - dulint new_lsn; + ulint fatal_cnt = 0; + dulint old_lsn; + dulint new_lsn; + /* longest waiting thread for a semaphore */ + os_thread_id_t waiter = os_thread_get_curr_id(); + os_thread_id_t old_waiter = waiter; + /* the semaphore that is being waited for */ + const void* sema = NULL; + const void* old_sema = NULL; old_lsn = srv_start_lsn; @@ -2224,10 +2230,11 @@ loop: /* In case mutex_exit is not a memory barrier, it is theoretically possible some threads are left waiting though the semaphore is already released. 
Wake up those threads: */ - + sync_arr_wake_threads_if_sema_free(); - if (sync_array_print_long_waits()) { + if (sync_array_print_long_waits(&waiter, &sema) + && sema == old_sema && os_thread_eq(waiter, old_waiter)) { fatal_cnt++; if (fatal_cnt > 10) { @@ -2242,6 +2249,8 @@ loop: } } else { fatal_cnt = 0; + old_waiter = waiter; + old_sema = sema; } /* Flush stderr so that a database user gets the output diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c index 41d3492c8c9..93a7398f252 100644 --- a/storage/innobase/sync/sync0arr.c +++ b/storage/innobase/sync/sync0arr.c @@ -916,10 +916,12 @@ sync_arr_wake_threads_if_sema_free(void) Prints warnings of long semaphore waits to stderr. */ ibool -sync_array_print_long_waits(void) -/*=============================*/ - /* out: TRUE if fatal semaphore wait threshold - was exceeded */ +sync_array_print_long_waits( +/*========================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ + os_thread_id_t* waiter, /* out: longest waiting thread */ + const void** sema) /* out: longest-waited-for semaphore */ { sync_cell_t* cell; ibool old_val; @@ -927,24 +929,40 @@ sync_array_print_long_waits(void) ulint i; ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; ibool fatal = FALSE; + double longest_diff = 0; for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + double diff; + void* wait_object; + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) > 240) { + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); sync_array_cell_print(stderr, cell); noticed = TRUE; } - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) - > fatal_timeout) { + 
if (diff > fatal_timeout) { fatal = TRUE; } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } } if (noticed) { diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c index ef4c07e8c26..089e87a8a5c 100644 --- a/storage/innobase/sync/sync0rw.c +++ b/storage/innobase/sync/sync0rw.c @@ -663,7 +663,9 @@ rw_lock_add_debug_info( rw_lock_debug_mutex_exit(); if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { - sync_thread_add_level(lock, lock->level); + sync_thread_add_level(lock, lock->level, + lock_type == RW_LOCK_EX + && lock->writer_count > 1); } } diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c index 944fd2a97fc..1099dff798e 100644 --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -641,7 +641,7 @@ mutex_set_debug_info( ut_ad(mutex); ut_ad(file_name); - sync_thread_add_level(mutex, mutex->level); + sync_thread_add_level(mutex, mutex->level, FALSE); mutex->file_name = file_name; mutex->line = line; @@ -1011,8 +1011,9 @@ void sync_thread_add_level( /*==================*/ void* latch, /* in: pointer to a mutex or an rw-lock */ - ulint level) /* in: level in the latching order; if + ulint level, /* in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /* in: TRUE if re-entering an x-lock */ { sync_level_t* array; sync_level_t* slot; @@ -1060,6 +1061,10 @@ sync_thread_add_level( array = thread_slot->levels; + if (relock) { + goto levels_ok; + } + /* NOTE that there is a problem with _NODE and _LEAF levels: if the B-tree height changes, then a leaf can change to an internal node or the other way around. 
We do not know at present if this can cause @@ -1209,6 +1214,7 @@ sync_thread_add_level( ut_error; } +levels_ok: for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { slot = sync_thread_levels_get_nth(array, i); diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.c index 38ad53fcfb0..730ac6a6f60 100644 --- a/storage/innobase/trx/trx0rec.c +++ b/storage/innobase/trx/trx0rec.c @@ -1397,6 +1397,10 @@ trx_undo_prev_version_build( return(DB_ERROR); } +# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets)); +# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint* ext_vect; ulint n_ext_vect; diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c index a82d7f452fc..d174f1e1b37 100644 --- a/storage/innobase/trx/trx0trx.c +++ b/storage/innobase/trx/trx0trx.c @@ -41,6 +41,9 @@ sess_t* trx_dummy_sess = NULL; /* Number of transactions currently allocated for MySQL: protected by the kernel mutex */ ulint trx_n_mysql_transactions = 0; +/* Number of transactions currently in the XA PREPARED state: protected by +the kernel mutex */ +ulint trx_n_prepared = 0; /***************************************************************** Starts the transaction if it is not yet started. 
*/ @@ -480,6 +483,7 @@ trx_lists_init_at_db_start(void) if (srv_force_recovery == 0) { trx->conc_state = TRX_PREPARED; + trx_n_prepared++; } else { fprintf(stderr, "InnoDB: Since" @@ -558,6 +562,7 @@ trx_lists_init_at_db_start(void) trx->conc_state = TRX_PREPARED; + trx_n_prepared++; } else { fprintf(stderr, "InnoDB: Since" @@ -832,6 +837,11 @@ trx_commit_off_kernel( || trx->conc_state == TRX_PREPARED); ut_ad(mutex_own(&kernel_mutex)); + if (UNIV_UNLIKELY(trx->conc_state == TRX_PREPARED)) { + ut_a(trx_n_prepared > 0); + trx_n_prepared--; + } + /* The following assignment makes the transaction committed in memory and makes its changes to data visible to other transactions. NOTE that there is a small discrepancy from the strict formal @@ -1882,6 +1892,7 @@ trx_prepare_off_kernel( /*--------------------------------------*/ trx->conc_state = TRX_PREPARED; + trx_n_prepared++; /*--------------------------------------*/ if (must_flush_log) { |