diff options
Diffstat (limited to 'innobase')
73 files changed, 2596 insertions, 882 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index 51c164b7cef..d8a0959e47f 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -822,9 +822,16 @@ btr_page_reorganize_low( { page_t* new_page; ulint log_mode; + ulint data_size1; + ulint data_size2; + ulint max_ins_size1; + ulint max_ins_size2; ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); + data_size1 = page_get_data_size(page); + max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); + /* Write the log record */ mlog_write_initial_log_record(page, MLOG_PAGE_REORGANIZE, mtr); @@ -859,6 +866,19 @@ btr_page_reorganize_low( lock_move_reorganize_page(page, new_page); } + data_size2 = page_get_data_size(page); + max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1); + + if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) { + buf_page_print(page); + buf_page_print(new_page); + fprintf(stderr, +"InnoDB: Error: page old data size %lu new data size %lu\n" +"InnoDB: Error: page old max ins size %lu new max ins size %lu\n" +"InnoDB: Make a detailed bug report and send it to mysql@lists.mysql.com\n", + data_size1, data_size2, max_ins_size1, max_ins_size2); + } + buf_frame_free(new_page); /* Restore logging mode */ @@ -1945,11 +1965,20 @@ btr_compress( btr_page_reorganize(merge_page, mtr); + max_ins_size = page_get_max_insert_size(merge_page, n_recs); + ut_ad(page_validate(merge_page, cursor->index)); ut_ad(page_get_max_insert_size(merge_page, n_recs) == max_ins_size_reorg); } + if (data_size > max_ins_size) { + + /* Add fault tolerance, though this should never happen */ + + return; + } + btr_search_drop_page_hash_index(page); /* Remove the page from the level list */ diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 69952c842ce..f6b4a2964f5 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -1110,6 +1110,10 @@ btr_cur_pessimistic_insert( if (big_rec_vec == NULL) { + if (n_extents > 0) { + 
fil_space_release_free_extents(index->space, + n_extents); + } return(DB_TOO_BIG_RECORD); } } @@ -1367,7 +1371,8 @@ btr_cur_update_sec_rec_in_place( } /***************************************************************** -Updates a record when the update causes no size changes in its fields. */ +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. */ ulint btr_cur_update_in_place( @@ -1458,7 +1463,8 @@ btr_cur_update_in_place( Tries to update a record on a page in an index tree. It is assumed that mtr holds an x-latch on the page. The operation does not succeed if there is too little space on the page or if the update would result in too empty a page, -so that tree compression is recommended. */ +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. */ ulint btr_cur_optimistic_update( @@ -1510,10 +1516,11 @@ btr_cur_optimistic_update( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - if (!row_upd_changes_field_size(rec, index, update)) { + if (!row_upd_changes_field_size_or_external(rec, index, update)) { - /* The simplest and most common case: the update does not - change the size of any field */ + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update */ return(btr_cur_update_in_place(flags, cursor, update, cmpl_info, thr, mtr)); @@ -1542,7 +1549,7 @@ btr_cur_optimistic_update( new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); - row_upd_clust_index_replace_new_col_vals(new_entry, update); + row_upd_index_replace_new_col_vals(new_entry, index, update, NULL); old_rec_size = rec_get_size(rec); new_rec_size = rec_get_converted_size(new_entry); @@ -1672,54 +1679,13 @@ btr_cur_pess_upd_restore_supremum( lock_rec_reset_and_inherit_gap_locks(page_get_supremum_rec(prev_page), 
rec); } - -/*************************************************************** -Replaces and copies the data in the new column values stored in the -update vector to the clustered index entry given. */ -static -void -btr_cur_copy_new_col_vals( -/*======================*/ - dtuple_t* entry, /* in/out: index entry where replaced */ - upd_t* update, /* in: update vector */ - mem_heap_t* heap) /* in: heap where data is copied */ -{ - upd_field_t* upd_field; - dfield_t* dfield; - dfield_t* new_val; - ulint field_no; - byte* data; - ulint i; - - dtuple_set_info_bits(entry, update->info_bits); - - for (i = 0; i < upd_get_n_fields(update); i++) { - - upd_field = upd_get_nth_field(update, i); - - field_no = upd_field->field_no; - - dfield = dtuple_get_nth_field(entry, field_no); - - new_val = &(upd_field->new_val); - - if (new_val->len == UNIV_SQL_NULL) { - data = NULL; - } else { - data = mem_heap_alloc(heap, new_val->len); - - ut_memcpy(data, new_val->data, new_val->len); - } - - dfield_set_data(dfield, data, new_val->len); - } -} /***************************************************************** Performs an update of a record on a page of a tree. It is assumed that mtr holds an x-latch on the tree and on the cursor page. If the update is made on the leaf level, to avoid deadlocks, mtr must also -own x-latches to brothers of page, if those brothers exist. */ +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. 
*/ ulint btr_cur_pessimistic_update( @@ -1816,7 +1782,7 @@ btr_cur_pessimistic_update( new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); - btr_cur_copy_new_col_vals(new_entry, update, heap); + row_upd_index_replace_new_col_vals(new_entry, index, update, heap); if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, @@ -1825,21 +1791,6 @@ btr_cur_pessimistic_update( trx->id); } - page_cursor = btr_cur_get_page_cur(cursor); - - /* Store state of explicit locks on rec on the page infimum record, - before deleting rec. The page infimum acts as a dummy carrier of the - locks, taking care also of lock releases, before we can move the locks - back on the actual record. There is a special case: if we are - inserting on the root page and the insert causes a call of - btr_root_raise_and_insert. Therefore we cannot in the lock system - delete the lock structs set on the root page even if the root - page carries just node pointers. */ - - lock_rec_store_on_page_infimum(rec); - - btr_search_update_hash_on_delete(cursor); - if (flags & BTR_NO_UNDO_LOG_FLAG) { /* We are in a transaction rollback undoing a row update: we must free possible externally stored fields @@ -1860,10 +1811,6 @@ btr_cur_pessimistic_update( ext_vect = mem_heap_alloc(heap, sizeof(ulint) * rec_get_n_fields(rec)); n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, update); - page_cur_delete_rec(page_cursor, mtr); - - page_cur_move_to_prev(page_cursor); - if ((rec_get_converted_size(new_entry) >= page_get_free_space_of_empty() / 2) || (rec_get_converted_size(new_entry) >= REC_MAX_DATA_SIZE)) { @@ -1874,10 +1821,31 @@ btr_cur_pessimistic_update( mem_heap_free(heap); + err = DB_TOO_BIG_RECORD; + goto return_after_reservations; } } + page_cursor = btr_cur_get_page_cur(cursor); + + /* Store state of explicit locks on rec on the page infimum record, + before deleting rec. 
The page infimum acts as a dummy carrier of the + locks, taking care also of lock releases, before we can move the locks + back on the actual record. There is a special case: if we are + inserting on the root page and the insert causes a call of + btr_root_raise_and_insert. Therefore we cannot in the lock system + delete the lock structs set on the root page even if the root + page carries just node pointers. */ + + lock_rec_store_on_page_infimum(rec); + + btr_search_update_hash_on_delete(cursor); + + page_cur_delete_rec(page_cursor, mtr); + + page_cur_move_to_prev(page_cursor); + rec = btr_cur_insert_if_possible(cursor, new_entry, &dummy_reorganized, mtr); ut_a(rec || optim_err != DB_UNDERFLOW); @@ -3372,8 +3340,8 @@ btr_free_externally_stored_field( page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO); - offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET); - + offset = mach_read_from_4(data + local_len + + BTR_EXTERN_OFFSET); extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4); diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index 13efacb9da3..39e70d91be8 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -381,6 +381,8 @@ btr_pcur_move_to_next_page( btr_leaf_page_release(page, cursor->latch_mode, mtr); page_cur_set_before_first(next_page, btr_pcur_get_page_cur(cursor)); + + page_check_dir(next_page); } /************************************************************* diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index 1cdddaf6cb4..9d920c7c0d8 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -34,6 +34,7 @@ Created 11/5/1995 Heikki Tuuri #include "ibuf0ibuf.h" #include "dict0dict.h" #include "log0recv.h" +#include "log0log.h" #include "trx0undo.h" #include "srv0srv.h" @@ -231,12 +232,12 @@ ibool buf_debug_prints = FALSE; /* If this is set TRUE, /************************************************************************ Calculates a page checksum which is stored 
to the page when it is written -to a file. Note that we must be careful to calculate the same value -on 32-bit and 64-bit architectures. */ +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. */ ulint -buf_calc_page_checksum( -/*===================*/ +buf_calc_page_new_checksum( +/*=======================*/ /* out: checksum */ byte* page) /* in: buffer page */ { @@ -244,12 +245,39 @@ buf_calc_page_checksum( /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO are written outside the buffer pool to the first pages of data - files, we have to skip them in page checksum calculation */ + files, we have to skip them in the page checksum calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of page because + there we store the old formula checksum. */ + + checksum = ut_fold_binary(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) + + ut_fold_binary(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + checksum = checksum & 0xFFFFFFFF; + + return(checksum); +} + +/************************************************************************ +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! 
*/ + +ulint +buf_calc_page_old_checksum( +/*=======================*/ + /* out: checksum */ + byte* page) /* in: buffer page */ +{ + ulint checksum; checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - + ut_fold_binary(page + FIL_PAGE_DATA, - UNIV_PAGE_SIZE - FIL_PAGE_DATA - - FIL_PAGE_END_LSN); + checksum = checksum & 0xFFFFFFFF; return(checksum); @@ -265,27 +293,69 @@ buf_page_is_corrupted( byte* read_buf) /* in: a database page */ { ulint checksum; + ulint old_checksum; + ulint checksum_field; + ulint old_checksum_field; + dulint current_lsn; - checksum = buf_calc_page_checksum(read_buf); + if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) + != mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { - /* Note that InnoDB initializes empty pages to zero, and - early versions of InnoDB did not store page checksum to - the 4 most significant bytes of the page lsn field at the - end of a page: */ + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(TRUE); + } + +#ifndef UNIV_HOTBACKUP + if (recv_lsn_checks_on && log_peek_lsn(&current_lsn)) { + if (ut_dulint_cmp(current_lsn, + mach_read_from_8(read_buf + FIL_PAGE_LSN)) + < 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: page %lu log sequence number %lu %lu\n" +"InnoDB: is in the future! Current system log sequence number %lu %lu.\n" +"InnoDB: Your database may be corrupt.\n", + mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + ut_dulint_get_high( + mach_read_from_8(read_buf + FIL_PAGE_LSN)), + ut_dulint_get_low( + mach_read_from_8(read_buf + FIL_PAGE_LSN)), + ut_dulint_get_high(current_lsn), + ut_dulint_get_low(current_lsn)); + } + } +#endif + old_checksum = buf_calc_page_old_checksum(read_buf); + + old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + /* There are 2 valid formulas for old_checksum_field: + 1. 
Very old versions of InnoDB only stored 8 byte lsn to the start + and the end of the page. + 2. Newer InnoDB versions store the old formula checksum there. */ - if ((mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) - != mach_read_from_4(read_buf + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN + 4)) - || (checksum != mach_read_from_4(read_buf - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN) - && mach_read_from_4(read_buf + FIL_PAGE_LSN) - != mach_read_from_4(read_buf - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN))) { + if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && old_checksum_field != old_checksum) { + return(TRUE); } + checksum = buf_calc_page_new_checksum(read_buf); + checksum_field = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ + + if (checksum_field != 0 && checksum_field != checksum) { + + return(TRUE); + } + return(FALSE); } @@ -299,6 +369,7 @@ buf_page_print( { dict_index_t* index; ulint checksum; + ulint old_checksum; char* buf; buf = mem_alloc(4 * UNIV_PAGE_SIZE); @@ -313,19 +384,23 @@ buf_page_print( mem_free(buf); - checksum = buf_calc_page_checksum(read_buf); + checksum = buf_calc_page_new_checksum(read_buf); + old_checksum = buf_calc_page_old_checksum(read_buf); ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Page checksum %lu stored checksum %lu\n", - checksum, mach_read_from_4(read_buf - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN)); + fprintf(stderr, +" InnoDB: Page checksum %lu, prior-to-4.0.14-form checksum %lu\n" +"InnoDB: stored checksum %lu, prior-to-4.0.14-form stored checksum %lu\n", + checksum, old_checksum, + mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM)); fprintf(stderr, "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n", mach_read_from_4(read_buf + FIL_PAGE_LSN), mach_read_from_4(read_buf + FIL_PAGE_LSN + 
4), mach_read_from_4(read_buf + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN + 4)); + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)); if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT) { fprintf(stderr, diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 47ac9c6b041..5102674a8df 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -361,21 +361,29 @@ buf_flush_init_for_writing( ulint space, /* in: space id */ ulint page_no) /* in: page number */ { - /* Write the newest modification lsn to the page */ - mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); + UT_NOT_USED(space); - mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, newest_lsn); + /* Write the newest modification lsn to the page header and trailer */ + mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); - /* Write to the page the space id and page number */ + mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + newest_lsn); + /* Write the page number */ - mach_write_to_4(page + FIL_PAGE_SPACE, space); mach_write_to_4(page + FIL_PAGE_OFFSET, page_no); + /* Store the new formula checksum */ + + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + buf_calc_page_new_checksum(page)); + /* We overwrite the first 4 bytes of the end lsn field to store - a page checksum */ + the old formula checksum. Since it depends also on the field + FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the + new formula checksum. 
*/ - mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, - buf_calc_page_checksum(page)); + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + buf_calc_page_old_checksum(page)); } /************************************************************************ diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c index 8ab5acb4da7..f2f94cc47ce 100644 --- a/innobase/data/data0data.c +++ b/innobase/data/data0data.c @@ -584,8 +584,7 @@ dtuple_convert_big_rec( * sizeof(big_rec_field_t)); /* Decide which fields to shorten: the algorithm is to look for - the longest field which does not occur in the ordering part - of any index on the table */ + the longest field whose type is DATA_BLOB */ n_fields = 0; @@ -610,12 +609,9 @@ dtuple_convert_big_rec( } } - /* Skip over fields which are ordering in some index */ - - if (!is_externally_stored && - dict_field_get_col( - dict_index_get_nth_field(index, i)) - ->ord_part == 0) { + if (!is_externally_stored + && dict_index_get_nth_type(index, i)->mtype + == DATA_BLOB) { dfield = dtuple_get_nth_field(entry, i); @@ -629,9 +625,13 @@ dtuple_convert_big_rec( } } - if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10 - + REC_1BYTE_OFFS_LIMIT) { + /* We do not store externally fields which are smaller than + DICT_MAX_COL_PREFIX_LEN */ + + ut_a(DICT_MAX_COL_PREFIX_LEN > REC_1BYTE_OFFS_LIMIT); + if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10 + + DICT_MAX_COL_PREFIX_LEN) { /* Cannot shorten more */ mem_heap_free(heap); @@ -644,13 +644,19 @@ dtuple_convert_big_rec( drop below 128 which is the limit for the 2-byte offset storage format in a physical record. This we accomplish by storing 128 bytes of data in entry - itself, and only the remaining part to big rec vec. */ + itself, and only the remaining part to big rec vec. + + We store the first bytes locally to the record. Then + we can calculate all ordering fields in all indexes + from locally stored data. 
*/ dfield = dtuple_get_nth_field(entry, longest_i); vector->fields[n_fields].field_no = longest_i; + ut_a(dfield->len > DICT_MAX_COL_PREFIX_LEN); + vector->fields[n_fields].len = dfield->len - - REC_1BYTE_OFFS_LIMIT; + - DICT_MAX_COL_PREFIX_LEN; vector->fields[n_fields].data = mem_heap_alloc(heap, vector->fields[n_fields].len); diff --git a/innobase/data/data0type.c b/innobase/data/data0type.c index 5d0ddf3e887..df430f06bcb 100644 --- a/innobase/data/data0type.c +++ b/innobase/data/data0type.c @@ -85,8 +85,6 @@ dtype_print( printf("DATA_MIX_ID"); } else if (prtype == DATA_ENGLISH) { printf("DATA_ENGLISH"); - } else if (prtype == DATA_FINNISH) { - printf("DATA_FINNISH"); } else { printf("prtype %lu", mtype); } diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index 374c567c3ca..0bf2ace3324 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -276,7 +276,7 @@ dict_boot(void) DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 1); - dict_mem_index_add_field(index, (char *) "NAME", 0); + dict_mem_index_add_field(index, (char *) "NAME", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLES, MLOG_4BYTES, &mtr); @@ -287,7 +287,7 @@ dict_boot(void) index = dict_mem_index_create((char *) "SYS_TABLES", (char *) "ID_IND", DICT_HDR_SPACE, DICT_UNIQUE, 1); - dict_mem_index_add_field(index, (char *) "ID", 0); + dict_mem_index_add_field(index, (char *) "ID", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLE_IDS, MLOG_4BYTES, &mtr); @@ -313,8 +313,8 @@ dict_boot(void) (char *) "CLUST_IND", DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); - dict_mem_index_add_field(index, (char *) "TABLE_ID", 0); - dict_mem_index_add_field(index, (char *) "POS", 0); + dict_mem_index_add_field(index, (char *) "TABLE_ID", 0, 0); + dict_mem_index_add_field(index, (char *) "POS", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_COLUMNS, MLOG_4BYTES, &mtr); @@ -343,8 +343,8 @@ dict_boot(void) (char *) "CLUST_IND", DICT_HDR_SPACE, 
DICT_UNIQUE | DICT_CLUSTERED, 2); - dict_mem_index_add_field(index, (char *) "TABLE_ID", 0); - dict_mem_index_add_field(index, (char *) "ID", 0); + dict_mem_index_add_field(index, (char *) "TABLE_ID", 0, 0); + dict_mem_index_add_field(index, (char *) "ID", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_INDEXES, MLOG_4BYTES, &mtr); @@ -365,8 +365,8 @@ dict_boot(void) (char *) "CLUST_IND", DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); - dict_mem_index_add_field(index, (char *) "INDEX_ID", 0); - dict_mem_index_add_field(index, (char *) "POS", 0); + dict_mem_index_add_field(index, (char *) "INDEX_ID", 0, 0); + dict_mem_index_add_field(index, (char *) "POS", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_FIELDS, MLOG_4BYTES, &mtr); diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index 3619ac02f4d..9139e589a0a 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -337,7 +337,7 @@ dict_create_index_for_cluster_step( for (i = 0; i < table->n_cols; i++) { col = dict_table_get_nth_col(table, i); - dict_mem_index_add_field(index, col->name, 0); + dict_mem_index_add_field(index, col->name, 0, 0); } (node->cluster)->index = index; @@ -450,9 +450,17 @@ dict_create_sys_fields_tuple( dict_field_t* field; dfield_t* dfield; byte* ptr; + ibool index_contains_column_prefix_field = FALSE; + ulint j; ut_ad(index && heap); + for (j = 0; j < index->n_fields; j++) { + if (dict_index_get_nth_field(index, j)->prefix_len > 0) { + index_contains_column_prefix_field = TRUE; + } + } + field = dict_index_get_nth_field(index, i); sys_fields = dict_sys->sys_fields; @@ -466,11 +474,25 @@ dict_create_sys_fields_tuple( mach_write_to_8(ptr, index->id); dfield_set_data(dfield, ptr, 8); - /* 1: POS ----------------------------*/ + /* 1: POS + PREFIX LENGTH ----------------------------*/ + dfield = dtuple_get_nth_field(entry, 1); ptr = mem_heap_alloc(heap, 4); - mach_write_to_4(ptr, i); + + if (index_contains_column_prefix_field) { + /* If 
there are column prefix fields in the index, then + we store the number of the field to the 2 HIGH bytes + and the prefix length to the 2 low bytes, */ + + mach_write_to_4(ptr, (i << 16) + field->prefix_len); + } else { + /* Else we store the number of the field to the 2 LOW bytes. + This is to keep the storage format compatible with + InnoDB versions < 4.0.14. */ + + mach_write_to_4(ptr, i); + } dfield_set_data(dfield, ptr, 4); /* 4: COL_NAME -------------------------*/ diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index c11a5f76d94..924fa3ecf95 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -88,15 +88,6 @@ dict_index_remove_from_cache( dict_table_t* table, /* in: table */ dict_index_t* index); /* in, own: index */ /*********************************************************************** -Adds a column to index. */ -UNIV_INLINE -void -dict_index_add_col( -/*===============*/ - dict_index_t* index, /* in: index */ - dict_col_t* col, /* in: column */ - ulint order); /* in: order criterion */ -/*********************************************************************** Copies fields contained in index2 to index1. */ static void @@ -482,8 +473,9 @@ dict_index_get_nth_col_pos( ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + col = dict_table_get_nth_col(index->table, n); + if (index->type & DICT_CLUSTERED) { - col = dict_table_get_nth_col(index->table, n); return(col->clust_pos); } @@ -492,9 +484,8 @@ dict_index_get_nth_col_pos( for (pos = 0; pos < n_fields; pos++) { field = dict_index_get_nth_field(index, pos); - col = field->col; - if (dict_col_get_no(col) == n) { + if (col == field->col && field->prefix_len == 0) { return(pos); } @@ -502,7 +493,86 @@ dict_index_get_nth_col_pos( return(ULINT_UNDEFINED); } + +/************************************************************************ +Returns TRUE if the index contains a column or a prefix of that column. 
*/ + +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + /* out: TRUE if contains the column or its + prefix */ + dict_index_t* index, /* in: index */ + ulint n) /* in: column number */ +{ + dict_field_t* field; + dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + if (index->type & DICT_CLUSTERED) { + + return(TRUE); + } + + col = dict_table_get_nth_col(index->table, n); + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************ +Looks for a matching field in an index. The column and the prefix len have +to be the same. */ + +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + /* out: position in internal representation + of the index; if not contained, returns + ULINT_UNDEFINED */ + dict_index_t* index, /* in: index from which to search */ + dict_index_t* index2, /* in: index */ + ulint n) /* in: field number in index2 */ +{ + dict_field_t* field; + dict_field_t* field2; + ulint n_fields; + ulint pos; + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + field2 = dict_index_get_nth_field(index2, n); + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (field->col == field2->col + && field->prefix_len == field2->prefix_len) { + + return(pos); + } + } + + return(ULINT_UNDEFINED); +} + /************************************************************************** Returns a table object, based on table id, and memoryfixes it. */ @@ -622,8 +692,7 @@ dict_table_get( } /************************************************************************** -Returns a table object and increments MySQL open handle count on the table. 
-*/ +Returns a table object and increments MySQL open handle count on the table. */ dict_table_t* dict_table_get_and_increment_handle_count( @@ -732,11 +801,12 @@ dict_table_add_to_cache( } /* Add table to hash table of tables */ - HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, table); + HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + table); /* Add table to hash table of tables based on table id */ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold, - table); + table); /* Add table to LRU list of tables */ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); @@ -828,7 +898,7 @@ dict_table_rename_in_cache( /* Remove table from the hash tables of tables */ HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, - ut_fold_string(table->name), table); + ut_fold_string(table->name), table); name_buf = mem_heap_alloc(table->heap, ut_strlen(new_name) + 1); @@ -837,7 +907,8 @@ dict_table_rename_in_cache( table->name = name_buf; /* Add table to hash table of tables */ - HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, table); + HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + table); dict_sys->size += (mem_heap_get_size(table->heap) - old_size); @@ -1128,7 +1199,6 @@ dict_index_add_to_cache( ulint n_ord; ibool success; ulint i; - ulint j; ut_ad(index); ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1158,28 +1228,6 @@ dict_index_add_to_cache( return(FALSE); } - - /* Check that the same column does not appear twice in the index. 
- InnoDB assumes this in its algorithms, e.g., update of an index - entry */ - - for (i = 0; i < dict_index_get_n_fields(index); i++) { - - for (j = 0; j < i; j++) { - if (dict_index_get_nth_field(index, j)->col - == dict_index_get_nth_field(index, i)->col) { - - ut_print_timestamp(stderr); - - fprintf(stderr, -" InnoDB: Error: column %s appears twice in index %s of table %s\n" -"InnoDB: This is not allowed in InnoDB.\n" -"InnoDB: UPDATE can cause such an index to become corrupt in InnoDB.\n", - dict_index_get_nth_field(index, i)->col->name, - index->name, table->name); - } - } - } /* Build the cache internal representation of the index, containing also the added system fields */ @@ -1223,8 +1271,8 @@ dict_index_add_to_cache( cluster = dict_table_get_low(table->cluster_name); - tree = dict_index_get_tree(UT_LIST_GET_FIRST(cluster->indexes)); - + tree = dict_index_get_tree( + UT_LIST_GET_FIRST(cluster->indexes)); new_index->tree = tree; new_index->page_no = tree->page; } else { @@ -1352,13 +1400,14 @@ UNIV_INLINE void dict_index_add_col( /*===============*/ - dict_index_t* index, /* in: index */ - dict_col_t* col, /* in: column */ - ulint order) /* in: order criterion */ + dict_index_t* index, /* in: index */ + dict_col_t* col, /* in: column */ + ulint order, /* in: order criterion */ + ulint prefix_len) /* in: column prefix length */ { dict_field_t* field; - dict_mem_index_add_field(index, col->name, order); + dict_mem_index_add_field(index, col->name, order, prefix_len); field = dict_index_get_nth_field(index, index->n_def - 1); @@ -1384,7 +1433,8 @@ dict_index_copy( for (i = start; i < end; i++) { field = dict_index_get_nth_field(index2, i); - dict_index_add_col(index1, field->col, field->order); + dict_index_add_col(index1, field->col, field->order, + field->prefix_len); } } @@ -1487,7 +1537,7 @@ dict_index_build_internal_clust( /* Add the mix id column */ dict_index_add_col(new_index, - dict_table_get_sys_col(table, DATA_MIX_ID), 0); + 
dict_table_get_sys_col(table, DATA_MIX_ID), 0, 0); /* Copy the rest of fields */ dict_index_copy(new_index, index, table->mix_len, @@ -1525,14 +1575,15 @@ dict_index_build_internal_clust( if (!(index->type & DICT_UNIQUE)) { dict_index_add_col(new_index, - dict_table_get_sys_col(table, DATA_ROW_ID), 0); + dict_table_get_sys_col(table, DATA_ROW_ID), 0, 0); trx_id_pos++; } dict_index_add_col(new_index, - dict_table_get_sys_col(table, DATA_TRX_ID), 0); + dict_table_get_sys_col(table, DATA_TRX_ID), 0, 0); + dict_index_add_col(new_index, - dict_table_get_sys_col(table, DATA_ROLL_PTR), 0); + dict_table_get_sys_col(table, DATA_ROLL_PTR), 0, 0); for (i = 0; i < trx_id_pos; i++) { @@ -1561,7 +1612,14 @@ dict_index_build_internal_clust( for (i = 0; i < new_index->n_def; i++) { field = dict_index_get_nth_field(new_index, i); - (field->col)->aux = 0; + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + field->col->aux = 0; + } } /* Add to new_index non-system columns of table not yet included @@ -1572,7 +1630,7 @@ dict_index_build_internal_clust( ut_ad(col->type.mtype != DATA_SYS); if (col->aux == ULINT_UNDEFINED) { - dict_index_add_col(new_index, col, 0); + dict_index_add_col(new_index, col, 0, 0); } } @@ -1584,7 +1642,11 @@ dict_index_build_internal_clust( for (i = 0; i < new_index->n_def; i++) { field = dict_index_get_nth_field(new_index, i); - (field->col)->clust_pos = i; + + if (field->prefix_len == 0) { + + field->col->clust_pos = i; + } } new_index->cached = TRUE; @@ -1646,25 +1708,33 @@ dict_index_build_internal_non_clust( for (i = 0; i < clust_index->n_uniq; i++) { field = dict_index_get_nth_field(clust_index, i); - (field->col)->aux = ULINT_UNDEFINED; + field->col->aux = ULINT_UNDEFINED; } /* Mark with 0 table columns already contained in new_index */ for (i = 0; i < new_index->n_def; i++) { field = dict_index_get_nth_field(new_index, i); - (field->col)->aux = 0; 
+ + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + field->col->aux = 0; + } } - /* Add to new_index columns necessary to determine the clustered + /* Add to new_index the columns necessary to determine the clustered index entry uniquely */ for (i = 0; i < clust_index->n_uniq; i++) { field = dict_index_get_nth_field(clust_index, i); - if ((field->col)->aux == ULINT_UNDEFINED) { - dict_index_add_col(new_index, field->col, 0); + if (field->col->aux == ULINT_UNDEFINED) { + dict_index_add_col(new_index, field->col, 0, + field->prefix_len); } } @@ -1787,6 +1857,14 @@ dict_foreign_find_index( for (i = 0; i < n_cols; i++) { col_name = dict_index_get_nth_field(index, i) ->col->name; + if (dict_index_get_nth_field(index, i) + ->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + + break; + } + if (ut_strlen(columns[i]) != ut_strlen(col_name) || 0 != ut_cmp_in_lower_case(columns[i], @@ -2327,9 +2405,12 @@ dict_strip_comments( ptr = str; for (;;) { +scan_more: if (*sptr == '\0') { *ptr = '\0'; + ut_a(ptr <= str + strlen(sql_string)); + return(str); } @@ -2343,7 +2424,7 @@ dict_strip_comments( || *sptr == (char)0x0D || *sptr == '\0') { - break; + goto scan_more; } sptr++; @@ -2357,12 +2438,12 @@ dict_strip_comments( sptr += 2; - break; + goto scan_more; } if (*sptr == '\0') { - break; + goto scan_more; } sptr++; @@ -3776,6 +3857,10 @@ dict_field_print_low( ut_ad(mutex_own(&(dict_sys->mutex))); printf(" %s", field->name); + + if (field->prefix_len != 0) { + printf("(%lu)", field->prefix_len); + } } /************************************************************************** diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index 999eb55bb20..d5c51a43747 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -301,6 +301,8 @@ dict_load_fields( dtuple_t* tuple; dfield_t* dfield; char* col_name; + ulint 
pos_and_prefix_len; + ulint prefix_len; rec_t* rec; byte* field; ulint len; @@ -345,8 +347,28 @@ dict_load_fields( ut_a(ut_memcmp(buf, field, len) == 0); field = rec_get_nth_field(rec, 1, &len); - ut_ad(len == 4); - ut_a(i == mach_read_from_4(field)); + ut_a(len == 4); + + /* The next field stores the field position in the index + and a possible column prefix length if the index field + does not contain the whole column. The storage format is + like this: if there is at least one prefix field in the index, + then the HIGH 2 bytes contain the field number (== i) and the + low 2 bytes the prefix length for the field. Otherwise the + field number (== i) is contained in the 2 LOW bytes. */ + + pos_and_prefix_len = mach_read_from_4(field); + + ut_a((pos_and_prefix_len & 0xFFFF) == i + || (pos_and_prefix_len & 0xFFFF0000) == (i << 16)); + + if ((i == 0 && pos_and_prefix_len > 0) + || (pos_and_prefix_len & 0xFFFF0000) > 0) { + + prefix_len = pos_and_prefix_len & 0xFFFF; + } else { + prefix_len = 0; + } ut_a(0 == ut_strcmp((char*) "COL_NAME", dict_field_get_col( @@ -359,7 +381,7 @@ dict_load_fields( ut_memcpy(col_name, field, len); col_name[len] = '\0'; - dict_mem_index_add_field(index, col_name, 0); + dict_mem_index_add_field(index, col_name, 0, prefix_len); btr_pcur_move_to_next_user_rec(&pcur, &mtr); } diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c index e5918c6aeb6..56efc0a0117 100644 --- a/innobase/dict/dict0mem.c +++ b/innobase/dict/dict0mem.c @@ -266,10 +266,13 @@ by the column name may be released only after publishing the index. 
*/ void dict_mem_index_add_field( /*=====================*/ - dict_index_t* index, /* in: index */ - char* name, /* in: column name */ - ulint order) /* in: order criterion; 0 means an ascending - order */ + dict_index_t* index, /* in: index */ + char* name, /* in: column name */ + ulint order, /* in: order criterion; 0 means an + ascending order */ + ulint prefix_len) /* in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ { dict_field_t* field; @@ -282,6 +285,8 @@ dict_mem_index_add_field( field->name = name; field->order = order; + + field->prefix_len = prefix_len; } /************************************************************************** diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 98980f6c337..f55df90846c 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -632,7 +632,7 @@ fil_space_create( /* Spaces with an odd id number are reserved to replicate spaces used in log debugging */ - ut_a((purpose == FIL_LOG) || (id % 2 == 0)); + ut_anp((purpose == FIL_LOG) || (id % 2 == 0)); #endif mutex_enter(&(system->mutex)); @@ -831,6 +831,34 @@ fil_space_release_free_extents( mutex_exit(&(system->mutex)); } +/*********************************************************************** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ + +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id) /* in: space id */ +{ + fil_space_t* space; + fil_system_t* system = fil_system; + ulint n; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + ut_a(space); + + n = space->n_reserved_extents; + + mutex_exit(&(system->mutex)); + + return(n); +} + /************************************************************************ Prepares a file node for i/o. Opens the file if it is closed. Updates the pending i/o's field in the node and the system appropriately. 
Takes the node @@ -1202,8 +1230,8 @@ loop: /* Do aio */ - ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + ut_anp(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_anp((len % OS_FILE_LOG_BLOCK_SIZE) == 0); /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index ee48288b875..20bf4972f64 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -778,7 +778,7 @@ fsp_init_file_page_low( page[i] = 0xFF; } #endif - mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, + mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, ut_dulint_zero); mach_write_to_8(page + FIL_PAGE_LSN, ut_dulint_zero); } @@ -2709,10 +2709,52 @@ fseg_free_page_low( return; } +/* + fprintf(stderr, +"InnoDB: InnoDB is freeing space %lu page %lu,\n" +"InnoDB: which belongs to descr seg %lu %lu\n" +"InnoDB: segment %lu %lu.\n", + space, page, + ut_dulint_get_high( + mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), + ut_dulint_get_low( + mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), + ut_dulint_get_high( + mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)), + ut_dulint_get_low( + mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))); +*/ /* If we get here, the page is in some extent of the segment */ - ut_a(0 == ut_dulint_cmp( + if (0 != ut_dulint_cmp( mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr), - mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))); + mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))) { + + ut_sprintf_buf(errbuf, descr, 40); + fprintf(stderr, +"InnoDB: Dump of the tablespace extent descriptor: %s\n", errbuf); + ut_sprintf_buf(errbuf, seg_inode, 40); + fprintf(stderr, +"InnoDB: Dump of the segment inode: %s\n", errbuf); + + fprintf(stderr, +"InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n" +"InnoDB: which does not belong to segment %lu %lu but 
belongs\n" +"InnoDB: to segment %lu %lu.\n", + space, page, + ut_dulint_get_high( + mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), + ut_dulint_get_low( + mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), + ut_dulint_get_high( + mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)), + ut_dulint_get_low( + mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))); + + fprintf(stderr, +"InnoDB: If the InnoDB recovery crashes here, see section 6.1\n" +"InnoDB: of http://www.innodb.com/ibman.html about forcing recovery.\n"); + ut_a(0); + } not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr); @@ -2875,7 +2917,7 @@ fseg_free_step( freed yet */ ut_a(descr); - ut_a(xdes_get_bit(descr, XDES_FREE_BIT, buf_frame_get_page_no(header) + ut_anp(xdes_get_bit(descr, XDES_FREE_BIT, buf_frame_get_page_no(header) % FSP_EXTENT_SIZE, mtr) == FALSE); inode = fseg_inode_get(header, mtr); diff --git a/innobase/ha/ha0ha.c b/innobase/ha/ha0ha.c index b847798586d..eb28e15215d 100644 --- a/innobase/ha/ha0ha.c +++ b/innobase/ha/ha0ha.c @@ -293,11 +293,13 @@ ha_print_info( hash_table_t* table) /* in: hash table */ { hash_cell_t* cell; -/* ha_node_t* node; */ -/* ulint nodes = 0; */ +/* + ha_node_t* node; + ulint len = 0; + ulint max_len = 0; + ulint nodes = 0; +*/ ulint cells = 0; -/* ulint len = 0; */ -/* ulint max_len = 0; */ ulint n_bufs; ulint i; diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index 187afa17047..c07756ab308 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -170,7 +170,7 @@ dropped! So, there seems to be no problem. */ /********************************************************************** Validates the ibuf data structures when the caller owns ibuf_mutex. 
*/ -static + ibool ibuf_validate_low(void); /*===================*/ @@ -484,8 +484,8 @@ ibuf_data_init_for_space( index = dict_mem_index_create(buf, (char *) "CLUST_IND", space, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,2); - dict_mem_index_add_field(index, (char *) "PAGE_NO", 0); - dict_mem_index_add_field(index, (char *) "TYPES", 0); + dict_mem_index_add_field(index, (char *) "PAGE_NO", 0, 0); + dict_mem_index_add_field(index, (char *) "TYPES", 0, 0); index->page_no = FSP_IBUF_TREE_ROOT_PAGE_NO; @@ -2727,7 +2727,7 @@ reset_bit: /********************************************************************** Validates the ibuf data structures when the caller owns ibuf_mutex. */ -static + ibool ibuf_validate_low(void) /*===================*/ diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h index 1d17c0e952d..506877333c3 100644 --- a/innobase/include/btr0cur.h +++ b/innobase/include/btr0cur.h @@ -690,7 +690,13 @@ and sleep this many microseconds in between */ #define BTR_CUR_RETRY_DELETE_N_TIMES 100 #define BTR_CUR_RETRY_SLEEP_TIME 50000 -/* The reference in a field of which data is stored on a different page */ +/* The reference in a field for which data is stored on a different page. +The reference is at the end of the 'locally' stored part of the field. +'Locally' means storage in the index record. +We store locally a long enough prefix of each column so that we can determine +the ordering parts of each index record without looking into the externally +stored part. */ + /*--------------------------------------*/ #define BTR_EXTERN_SPACE_ID 0 /* space id where stored */ #define BTR_EXTERN_PAGE_NO 4 /* page no where stored */ diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index b613d60ebf7..d2ee1a440c7 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -388,11 +388,24 @@ to a file. Note that we must be careful to calculate the same value on 32-bit and 64-bit architectures. 
*/ ulint -buf_calc_page_checksum( -/*===================*/ +buf_calc_page_new_checksum( +/*=======================*/ /* out: checksum */ byte* page); /* in: buffer page */ /************************************************************************ +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! */ + +ulint +buf_calc_page_old_checksum( +/*=======================*/ + /* out: checksum */ + byte* page); /* in: buffer page */ +/************************************************************************ Checks if a page is corrupt. */ ibool diff --git a/innobase/include/data0data.h b/innobase/include/data0data.h index e0fb06e5018..889d148d3fe 100644 --- a/innobase/include/data0data.h +++ b/innobase/include/data0data.h @@ -453,8 +453,6 @@ struct dfield_struct{ void* data; /* pointer to data */ ulint len; /* data length; UNIV_SQL_NULL if SQL null; */ dtype_t type; /* type of data */ - ulint col_no; /* when building index entries, the column - number can be stored here */ }; struct dtuple_struct { diff --git a/innobase/include/data0type.h b/innobase/include/data0type.h index b53a70a8909..4da686bf2e1 100644 --- a/innobase/include/data0type.h +++ b/innobase/include/data0type.h @@ -18,14 +18,16 @@ typedef struct dtype_struct dtype_t; data type */ extern dtype_t* dtype_binary; -/* Data main types of SQL data; NOTE! character data types requiring -collation transformation must have the smallest codes! All codes must be -less than 256! 
*/ +/* Data main types of SQL data */ #define DATA_VARCHAR 1 /* character varying */ #define DATA_CHAR 2 /* fixed length character */ #define DATA_FIXBINARY 3 /* binary string of fixed length */ #define DATA_BINARY 4 /* binary string */ -#define DATA_BLOB 5 /* binary large object */ +#define DATA_BLOB 5 /* binary large object, or a TEXT type; if + prtype & DATA_NONLATIN1 != 0 the data must + be compared by MySQL as a whole field; if + prtype & DATA_BINARY_TYPE == 0, then this is + actually a TEXT column */ #define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ #define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ #define DATA_SYS 8 /* system column */ @@ -34,35 +36,55 @@ binary strings */ #define DATA_FLOAT 9 #define DATA_DOUBLE 10 #define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */ -#define DATA_VARMYSQL 12 /* data types for which comparisons must be */ -#define DATA_MYSQL 13 /* made by MySQL */ -#define DATA_ERROR 111 /* error value */ -#define DATA_MTYPE_MAX 255 +#define DATA_VARMYSQL 12 /* non-latin1 varying length char */ +#define DATA_MYSQL 13 /* non-latin1 fixed length char */ +#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size() + requires the values are <= 63 */ /*-------------------------------------------*/ -/* Precise data types for system columns; NOTE: the values must run -from 0 up in the order given! All codes must be less than 256! */ +/* In the lowest byte in the precise type we store the MySQL type code +(not applicable for system columns). 
*/ + +#define DATA_ENGLISH 4 /* English language character string: this + is a relic from pre-MySQL time and only used + for InnoDB's own system tables */ +#define DATA_ERROR 111 /* another relic from pre-MySQL time */ + +#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL + type from the precise type */ + +/* Precise data types for system columns and the length of those columns; +NOTE: the values must run from 0 up in the order given! All codes must +be less than 256 */ #define DATA_ROW_ID 0 /* row id: a dulint */ #define DATA_ROW_ID_LEN 6 /* stored length for row id */ + #define DATA_TRX_ID 1 /* transaction id: 6 bytes */ #define DATA_TRX_ID_LEN 6 + #define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */ #define DATA_ROLL_PTR_LEN 7 + #define DATA_MIX_ID 3 /* mixed index label: a dulint, stored in a row in a compressed form */ #define DATA_MIX_ID_LEN 9 /* maximum stored length for mix id (in a compressed dulint form) */ #define DATA_N_SYS_COLS 4 /* number of system columns defined above */ +/*-------------------------------------------*/ +/* Flags ORed to the precise data type */ #define DATA_NOT_NULL 256 /* this is ORed to the precise type when the column is declared as NOT NULL */ #define DATA_UNSIGNED 512 /* this id ORed to the precise type when we have an unsigned integer type */ +#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character + string, this is ORed to the precise type: + this only holds for tables created with + >= MySQL-4.0.14 */ +#define DATA_NONLATIN1 2048 /* if the data type is a DATA_BLOB (actually + TEXT) of a non-latin1 type, this is ORed to + the precise type: this only holds for tables + created with >= MySQL-4.0.14 */ /*-------------------------------------------*/ -/* Precise types of a char or varchar data. All codes must be less than 256! 
*/ -#define DATA_ENGLISH 4 /* English language character string */ -#define DATA_FINNISH 5 /* Finnish */ -#define DATA_PRTYPE_MAX 255 - /* This many bytes we need to store the type information affecting the alphabetical order for a single field and decide the storage size of an SQL null*/ @@ -123,7 +145,7 @@ dtype_get_pad_char( /*===============*/ /* out: padding character code, or ULINT_UNDEFINED if no padding specified */ - dtype_t* type); /* in: typeumn */ + dtype_t* type); /* in: type */ /*************************************************************************** Returns the size of a fixed size data type, 0 if not a fixed size type. */ UNIV_INLINE @@ -150,24 +172,24 @@ dtype_is_fixed_size( /* out: TRUE if fixed size */ dtype_t* type); /* in: type */ /************************************************************************** -Stores to a type the information which determines its alphabetical -ordering. */ +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. */ UNIV_INLINE void dtype_store_for_order_and_null_size( /*================================*/ byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE - bytes */ + bytes where we store the info */ dtype_t* type); /* in: type struct */ /************************************************************************** -Reads of a type the stored information which determines its alphabetical -ordering. */ +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ UNIV_INLINE void dtype_read_for_order_and_null_size( /*===============================*/ dtype_t* type, /* in: type struct */ - byte* buf); /* in: buffer for type order info */ + byte* buf); /* in: buffer for the stored order info */ /************************************************************************* Validates a data type structure. 
*/ diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic index d82d976d076..ddd0b0ae8cc 100644 --- a/innobase/include/data0type.ic +++ b/innobase/include/data0type.ic @@ -110,7 +110,9 @@ dtype_get_pad_char( if (type->mtype == DATA_CHAR || type->mtype == DATA_VARCHAR || type->mtype == DATA_BINARY - || type->mtype == DATA_FIXBINARY) { + || type->mtype == DATA_FIXBINARY + || type->mtype == DATA_MYSQL + || type->mtype == DATA_VARMYSQL) { /* Space is the padding character for all char and binary strings */ @@ -124,39 +126,56 @@ dtype_get_pad_char( } /************************************************************************** -Stores to a type the information which determines its alphabetical -ordering. */ +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. */ UNIV_INLINE void dtype_store_for_order_and_null_size( /*================================*/ byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE - bytes */ + bytes where we store the info */ dtype_t* type) /* in: type struct */ { ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); buf[0] = (byte)(type->mtype & 0xFF); + + if (type->prtype & DATA_BINARY_TYPE) { + buf[0] = buf[0] | 128; + } + + if (type->prtype & DATA_NONLATIN1) { + buf[0] = buf[0] | 64; + } + buf[1] = (byte)(type->prtype & 0xFF); mach_write_to_2(buf + 2, type->len & 0xFFFF); } /************************************************************************** -Reads of a type the stored information which determines its alphabetical -ordering. */ +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. 
*/ UNIV_INLINE void dtype_read_for_order_and_null_size( /*===============================*/ dtype_t* type, /* in: type struct */ - byte* buf) /* in: buffer for type order info */ + byte* buf) /* in: buffer for stored type order info */ { ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); - type->mtype = buf[0]; + type->mtype = buf[0] & 63; type->prtype = buf[1]; + if (buf[0] & 128) { + type->prtype = type->prtype | DATA_BINARY_TYPE; + } + + if (buf[0] & 64) { + type->prtype = type->prtype | DATA_NONLATIN1; + } + type->len = mach_read_from_2(buf + 2); } diff --git a/innobase/include/db0err.h b/innobase/include/db0err.h index ab7d0caa35c..854b9794c00 100644 --- a/innobase/include/db0err.h +++ b/innobase/include/db0err.h @@ -44,8 +44,10 @@ Created 5/24/1996 Heikki Tuuri #define DB_CORRUPTION 39 /* data structure corruption noticed */ #define DB_COL_APPEARS_TWICE_IN_INDEX 40 /* InnoDB cannot handle an index where same column appears twice */ -#define DB_CANNOT_DROP_CONSTRAINT 40 /* dropping a foreign key constraint +#define DB_CANNOT_DROP_CONSTRAINT 41 /* dropping a foreign key constraint from a table failed */ +#define DB_NO_SAVEPOINT 42 /* no savepoint exists with the given + name */ /* The following are partial failure codes */ #define DB_FAIL 1000 diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index 97486a7c2f6..b5ec5381db2 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -569,6 +569,29 @@ dict_index_get_nth_col_pos( dict_index_t* index, /* in: index */ ulint n); /* in: column number */ /************************************************************************ +Returns TRUE if the index contains a column or a prefix of that column. 
*/ + +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + /* out: TRUE if contains the column or its + prefix */ + dict_index_t* index, /* in: index */ + ulint n); /* in: column number */ +/************************************************************************ +Looks for a matching field in an index. The column and the prefix len has +to be the same. */ + +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + /* out: position in internal representation + of the index; if not contained, returns + ULINT_UNDEFINED */ + dict_index_t* index, /* in: index from which to search */ + dict_index_t* index2, /* in: index */ + ulint n); /* in: field number in index2 */ +/************************************************************************ Looks for column n position in the clustered index. */ ulint diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic index 71ea67117a7..c5982c162a7 100644 --- a/innobase/include/dict0dict.ic +++ b/innobase/include/dict0dict.ic @@ -203,7 +203,6 @@ dict_index_get_n_fields( { ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); - ut_ad(index->cached); return(index->n_fields); } diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index 0798541cfe0..03dc913a7c9 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -111,10 +111,13 @@ by the column name may be released only after publishing the index. 
*/ void dict_mem_index_add_field( /*=====================*/ - dict_index_t* index, /* in: index */ - char* name, /* in: column name */ - ulint order); /* in: order criterion; 0 means an ascending - order */ + dict_index_t* index, /* in: index */ + char* name, /* in: column name */ + ulint order, /* in: order criterion; 0 means an + ascending order */ + ulint prefix_len); /* in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ /************************************************************************** Frees an index memory object. */ @@ -158,12 +161,18 @@ struct dict_col_struct{ in some of the functions below */ }; +#define DICT_MAX_COL_PREFIX_LEN 512 + /* Data structure for a field in an index */ struct dict_field_struct{ - dict_col_t* col; /* pointer to the table column */ - char* name; /* name of the column */ - ulint order; /* flags for ordering this field: - DICT_DESCEND, ... */ + dict_col_t* col; /* pointer to the table column */ + char* name; /* name of the column */ + ulint order; /* flags for ordering this field: + DICT_DESCEND, ... 
*/ + ulint prefix_len; /* 0 or the length of the column + prefix in a MySQL index of type, e.g., + INDEX (textcol(25)); must be smaller + than DICT_MAX_COL_PREFIX_LEN */ }; /* Data structure for an index tree */ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index 23ef0304b2d..ad3149f0b36 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -43,7 +43,10 @@ struct fil_addr_struct{ extern fil_addr_t fil_addr_null; /* The byte offsets on a file page for various variables */ -#define FIL_PAGE_SPACE 0 /* space id the page belongs to */ +#define FIL_PAGE_SPACE_OR_CHKSUM 0 /* in < MySQL-4.0.14 space id the + page belongs to (== 0) but in later + versions the 'new' checksum of the + page */ #define FIL_PAGE_OFFSET 4 /* page offset inside space */ #define FIL_PAGE_PREV 8 /* if there is a 'natural' predecessor of the page, its offset */ @@ -64,7 +67,7 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_DATA 38 /* start of the data on the page */ /* File page trailer */ -#define FIL_PAGE_END_LSN 8 /* the low 4 bytes of this are used +#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /* the low 4 bytes of this are used to store the page checksum, the last 4 bytes should be identical to the last 4 bytes of FIL_PAGE_LSN */ @@ -383,6 +386,14 @@ fil_space_release_free_extents( /*===========================*/ ulint id, /* in: space id */ ulint n_reserved); /* in: how many one reserved */ +/*********************************************************************** +Gets the number of reserved extents. If the database is silent, this number +should be zero. 
*/ + +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id); /* in: space id */ typedef struct fil_space_struct fil_space_t; diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index d3b3d55d015..5608ba020b7 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -450,6 +450,18 @@ lock_rec_get_mutex_for_addr( ulint space, /* in: space id */ ulint page_no);/* in: page number */ /************************************************************************* +Checks that a transaction id is sensible, i.e., not in the future. */ + +ibool +lock_check_trx_id_sanity( +/*=====================*/ + /* out: TRUE if ok */ + dulint trx_id, /* in: trx id */ + rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: clustered index */ + ibool has_kernel_mutex);/* in: TRUE if the caller owns the + kernel mutex */ +/************************************************************************* Validates the lock queue on a single record. */ ibool diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h index 4e1404b15fe..24ec28a56e6 100644 --- a/innobase/include/log0log.h +++ b/innobase/include/log0log.h @@ -173,6 +173,12 @@ log_write_up_to( /* in: TRUE if we want the written log also to be flushed to disk */ /******************************************************************** +Does a synchronous flush of the log buffer to disk. */ + +void +log_buffer_flush_to_disk(void); +/*==========================*/ +/******************************************************************** Advances the smallest lsn for which there are unflushed dirty blocks in the buffer pool and also may make a new checkpoint. NOTE: this function may only be called if the calling thread owns no synchronization objects!
*/ @@ -507,6 +513,15 @@ log_print( /*======*/ char* buf, /* in/out: buffer where to print */ char* buf_end);/* in: buffer end */ +/********************************************************** +Peeks the current lsn. */ + +ibool +log_peek_lsn( +/*=========*/ + /* out: TRUE if success, FALSE if could not get the + log system mutex */ + dulint* lsn); /* out: if returns TRUE, current lsn is here */ /************************************************************************** Refreshes the statistics used to print per-second averages. */ @@ -779,6 +794,11 @@ struct log_struct{ called */ /* Fields involved in checkpoints */ + ulint log_group_capacity; /* capacity of the log group; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash recovery */ ulint max_modified_age_async; /* when this recommended value for lsn - buf_pool_get_oldest_modification() diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h index bef42cfec1c..7b27ee34541 100644 --- a/innobase/include/log0recv.h +++ b/innobase/include/log0recv.h @@ -333,6 +333,8 @@ extern ibool recv_recovery_on; extern ibool recv_no_ibuf_operations; extern ibool recv_needed_recovery; +extern ibool recv_lsn_checks_on; + extern ibool recv_is_making_a_backup; extern ulint recv_max_parsed_page_no; diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index 86f27a2d3eb..5c52f0e92bf 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -146,6 +146,21 @@ os_file_create_simple( ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ ibool* success);/* out: TRUE if succeed, FALSE if error */ /******************************************************************** +A simple function to open or create a file. 
*/ + +os_file_t +os_file_create_simple_no_error_handling( +/*====================================*/ + /* out, own: handle to the file, not defined if error, + error number can be retrieved with os_get_last_error */ + char* name, /* in: name of the file or path as a null-terminated + string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened + (if does not exist, error), or OS_FILE_CREATE if a new + file is created (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ + ibool* success);/* out: TRUE if succeed, FALSE if error */ +/******************************************************************** Opens an existing file or creates a new. */ os_file_t @@ -160,7 +175,11 @@ os_file_create( file is created (if exists, error), OS_FILE_OVERWRITE if a new file is created or an old overwritten */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o - is desired, OS_FILE_NORMAL, if any normal file */ + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. and srv_.. + variables whether we really use async i/o or + unbuffered i/o: look in the function source code for + the exact rules */ ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success);/* out: TRUE if succeed, FALSE if error */ /*************************************************************************** @@ -173,6 +192,14 @@ os_file_close( /* out: TRUE if success */ os_file_t file); /* in, own: handle to a file */ /*************************************************************************** +Closes a file handle. */ + +ibool +os_file_close_no_error_handling( +/*============================*/ + /* out: TRUE if success */ + os_file_t file); /* in, own: handle to a file */ +/*************************************************************************** Gets a file size. 
*/ ibool diff --git a/innobase/include/os0sync.h b/innobase/include/os0sync.h index 634507467f9..e1cf263216e 100644 --- a/innobase/include/os0sync.h +++ b/innobase/include/os0sync.h @@ -36,8 +36,12 @@ typedef os_event_struct_t* os_event_t; struct os_event_struct { os_fast_mutex_t os_mutex; /* this mutex protects the next fields */ - ibool is_set; /* this is TRUE if the next mutex is - not reserved */ + ibool is_set; /* this is TRUE when the event is + in the signaled state, i.e., a thread + does not stop if it tries to wait for + this event */ + ib_longlong signal_count; /* this is incremented each time + the event becomes signaled */ pthread_cond_t cond_var; /* condition variable is used in waiting for the event */ UT_LIST_NODE_T(os_event_struct_t) os_event_list; diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index b5e33af5bc0..04f771c3abd 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -666,6 +666,15 @@ page_rec_validate( /* out: TRUE if ok */ rec_t* rec); /* in: record on the page */ /******************************************************************* +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ + +void +page_check_dir( +/*===========*/ + page_t* page); /* in: index page */ +/******************************************************************* This function checks the consistency of an index page when we do not know the index. This is also resilient so that this should never crash even if the page is total garbage. 
*/ diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h index 6f2a99fc8c2..712e263350e 100644 --- a/innobase/include/rem0cmp.h +++ b/innobase/include/rem0cmp.h @@ -42,6 +42,22 @@ cmp_data_data( buffer) */ ulint len2); /* in: data field length or UNIV_SQL_NULL */ /***************************************************************** +This function is used to compare two data fields for which we know the +data type. */ + +int +cmp_data_data_slow( +/*===============*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + dtype_t* cur_type,/* in: data type of the fields */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** This function is used to compare two dfields where at least the first has its data type field set. */ UNIV_INLINE diff --git a/innobase/include/row0mysql.ic b/innobase/include/row0mysql.ic index e9d493da8b5..4ecd66e06ec 100644 --- a/innobase/include/row0mysql.ic +++ b/innobase/include/row0mysql.ic @@ -58,7 +58,8 @@ row_mysql_store_col_in_innobase_format( /*===================================*/ dfield_t* dfield, /* in/out: dfield */ byte* buf, /* in/out: buffer for the converted - value */ + value; this must be at least col_len + long! 
*/ byte* mysql_data, /* in: MySQL column value, not SQL NULL; NOTE that dfield may also get a pointer to mysql_data, @@ -96,7 +97,6 @@ row_mysql_store_col_in_innobase_format( while (col_len > 0 && ptr[col_len - 1] == ' ') { col_len--; } - } else if (type == DATA_BLOB) { ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); } diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h index 09a79e19fd7..d1befbbbad3 100644 --- a/innobase/include/row0row.h +++ b/innobase/include/row0row.h @@ -86,9 +86,10 @@ dtuple_t* row_build( /*======*/ /* out, own: row built; see the NOTE below! */ - ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: - the former copies also the data fields to - heap as the latter only places pointers to + ulint type, /* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or + ROW_COPY_ALSO_EXTERNALS, + the two last copy also the data fields to + heap as the first only places pointers to data fields on the index page, and thus is more efficient */ dict_index_t* index, /* in: clustered index */ diff --git a/innobase/include/row0sel.h b/innobase/include/row0sel.h index cfc30852b87..a35d588ad08 100644 --- a/innobase/include/row0sel.h +++ b/innobase/include/row0sel.h @@ -87,9 +87,11 @@ row_printf_step( /* out: query thread to run next or NULL */ que_thr_t* thr); /* in: query thread */ /******************************************************************** -Converts a key value stored in MySQL format to an Innobase dtuple. -The last field of the key value may be just a prefix of a fixed length -field: hence the parameter key_len. */ +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. 
*/ void row_sel_convert_mysql_key_to_innobase( @@ -100,6 +102,7 @@ row_sel_convert_mysql_key_to_innobase( to index! */ byte* buf, /* in: buffer to use in field conversions */ + ulint buf_len, /* in: buffer length */ dict_index_t* index, /* in: index of the key value */ byte* key_ptr, /* in: MySQL key value */ ulint key_len); /* in: MySQL key value length */ diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h index 273ec6074eb..473c55c7ef9 100644 --- a/innobase/include/row0upd.h +++ b/innobase/include/row0upd.h @@ -114,13 +114,15 @@ row_upd_index_write_log( closed within this function */ mtr_t* mtr); /* in: mtr into whose log to write */ /*************************************************************** -Returns TRUE if row update changes size of some field in index. */ +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. */ ibool -row_upd_changes_field_size( -/*=======================*/ +row_upd_changes_field_size_or_external( +/*===================================*/ /* out: TRUE if the update changes the size of - some field in index */ + some field in index or the field is external + in rec or update */ rec_t* rec, /* in: record in clustered index */ dict_index_t* index, /* in: clustered index */ upd_t* update);/* in: update vector */ @@ -175,16 +177,10 @@ row_upd_index_replace_new_col_vals( dtuple_t* entry, /* in/out: index entry where replaced */ dict_index_t* index, /* in: index; NOTE that may also be a non-clustered index */ - upd_t* update); /* in: update vector */ -/*************************************************************** -Replaces the new column values stored in the update vector to the -clustered index entry given. 
*/ - -void -row_upd_clust_index_replace_new_col_vals( -/*=====================================*/ - dtuple_t* entry, /* in/out: index entry where replaced */ - upd_t* update); /* in: update vector */ + upd_t* update, /* in: update vector */ + mem_heap_t* heap); /* in: memory heap to which we allocate and + copy the new values, set this as NULL if you + do not want allocation */ /*************************************************************** Checks if an update vector changes an ordering field of an index record. This function is fast if the update vector is short or the number of ordering @@ -358,9 +354,9 @@ struct upd_node_struct{ externally in the clustered index record of row */ ulint n_ext_vec;/* number of fields in ext_vec */ - mem_heap_t* heap; /* memory heap used as auxiliary storage for - row; this must be emptied after a successful - update if node->row != NULL */ + mem_heap_t* heap; /* memory heap used as auxiliary storage; + this must be emptied after a successful + update */ /*----------------------*/ sym_node_t* table_sym;/* table node in symbol table */ que_node_t* col_assign_list; diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 8fd0fc2dd6d..87643e87a68 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -156,6 +156,7 @@ extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs, /* Array of English strings describing the current state of an i/o handler thread */ extern char* srv_io_thread_op_info[]; +extern char* srv_io_thread_function[]; typedef struct srv_sys_struct srv_sys_t; @@ -170,6 +171,7 @@ what these mean */ #define SRV_UNIX_O_DSYNC 2 #define SRV_UNIX_LITTLESYNC 3 #define SRV_UNIX_NOSYNC 4 +#define SRV_UNIX_O_DIRECT 5 /* Alternatives for file i/o in Windows */ #define SRV_WIN_IO_NORMAL 1 diff --git a/innobase/include/trx0roll.h b/innobase/include/trx0roll.h index 820af4cd014..0d7126c9c57 100644 --- a/innobase/include/trx0roll.h +++ b/innobase/include/trx0roll.h @@ -177,6 
+177,55 @@ trx_general_rollback_for_mysql( ibool partial,/* in: TRUE if partial rollback requested */ trx_savept_t* savept);/* in: pointer to savepoint undo number, if partial rollback requested */ +/*********************************************************************** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. */ + +ulint +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + char* savepoint_name, /* in: savepoint name */ + ib_longlong* mysql_binlog_cache_pos);/* out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +/*********************************************************************** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. */ + +ulint +trx_savepoint_for_mysql( +/*====================*/ + /* out: always DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + char* savepoint_name, /* in: savepoint name */ + ib_longlong binlog_cache_pos); /* in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +/*********************************************************************** +Frees savepoint structs. 
*/ + +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep); /* in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ extern sess_t* trx_dummy_sess; @@ -207,6 +256,21 @@ struct roll_node_struct{ case of a partial rollback */ }; +/* A savepoint set with SQL's "SAVEPOINT savepoint_id" command */ +struct trx_named_savept_struct{ + char* name; /* savepoint name */ + trx_savept_t savept; /* the undo number corresponding to + the savepoint */ + ib_longlong mysql_binlog_cache_pos; + /* the MySQL binlog cache position + corresponding to this savepoint, not + defined if the MySQL binlogging is not + enabled */ + UT_LIST_NODE_T(trx_named_savept_t) + trx_savepoints; /* the list of savepoints of a + transaction */ +}; + /* Rollback node states */ #define ROLL_NODE_SEND 1 #define ROLL_NODE_WAIT 2 diff --git a/innobase/include/trx0sys.ic b/innobase/include/trx0sys.ic index ada2d8cb19c..343e6d7c2fa 100644 --- a/innobase/include/trx0sys.ic +++ b/innobase/include/trx0sys.ic @@ -296,6 +296,16 @@ trx_is_active( return(FALSE); } + if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) { + + /* There must be corruption: we return TRUE because this + function is only called by lock_clust_rec_some_has_impl() + and row_vers_impl_x_locked_off_kernel() and they have + diagnostic prints in this case */ + + return(TRUE); + } + trx = trx_get_on_id(trx_id); if (trx && (trx->conc_state == TRX_ACTIVE)) { diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 39229923375..6b08b674db8 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -381,7 +381,8 @@ struct trx_struct{ replication slave, we have here the master binlog name up to which replication has processed; otherwise - this is a pointer to a null character */ + this is a pointer to a null + character */ ib_longlong mysql_master_log_pos; /* if the database server is a MySQL replication slave, 
this is the @@ -501,6 +502,10 @@ struct trx_struct{ mem_heap_t* read_view_heap; /* memory heap for the read view */ read_view_t* read_view; /* consistent read view or NULL */ /*------------------------------*/ + UT_LIST_BASE_NODE_T(trx_named_savept_t) + trx_savepoints; /* savepoints set with SAVEPOINT ..., + oldest first */ + /*------------------------------*/ mutex_t undo_mutex; /* mutex protecting the fields in this section (down to undo_no_arr), EXCEPT last_sql_stat_start, which can be diff --git a/innobase/include/trx0types.h b/innobase/include/trx0types.h index b8befe7172f..2965eb4451f 100644 --- a/innobase/include/trx0types.h +++ b/innobase/include/trx0types.h @@ -24,6 +24,7 @@ typedef struct trx_undo_inf_struct trx_undo_inf_t; typedef struct trx_purge_struct trx_purge_t; typedef struct roll_node_struct roll_node_t; typedef struct commit_node_struct commit_node_t; +typedef struct trx_named_savept_struct trx_named_savept_t; /* Transaction savepoint */ typedef struct trx_savept_struct trx_savept_t; diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h index e99dc8c09d6..802557099fc 100644 --- a/innobase/include/ut0dbg.h +++ b/innobase/include/ut0dbg.h @@ -50,6 +50,37 @@ extern ulint* ut_dbg_null_ptr; }\ } +/* This can be used if there are % characters in the assertion formula: +if we try to printf the formula gcc would complain of illegal print +format characters */ +#define ut_anp(EXPR)\ +{\ + ulint dbg_i;\ +\ + if (!((ulint)(EXPR) + ut_dbg_zero)) {\ + ut_print_timestamp(stderr);\ + fprintf(stderr,\ + " InnoDB: Assertion failure in thread %lu in file %s line %lu\n",\ + os_thread_pf(os_thread_get_curr_id()), IB__FILE__,\ + (ulint)__LINE__);\ + fprintf(stderr,\ + "\nInnoDB: We intentionally generate a memory trap.\n");\ + fprintf(stderr,\ + "InnoDB: Send a detailed bug report to mysql@lists.mysql.com\n");\ + ut_dbg_stop_threads = TRUE;\ + dbg_i = *(ut_dbg_null_ptr);\ + if (dbg_i) {\ + ut_dbg_null_ptr = NULL;\ + }\ + }\ + if (ut_dbg_stop_threads) {\ 
+ fprintf(stderr,\ + "InnoDB: Thread %lu stopped in file %s line %lu\n",\ + os_thread_pf(os_thread_get_curr_id()), IB__FILE__, (ulint)__LINE__);\ + os_thread_sleep(1000000000);\ + }\ +} + #define ut_error {\ ulint dbg_i;\ ut_print_timestamp(stderr);\ diff --git a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h index d3d04d58596..ba6905a8618 100644 --- a/innobase/include/ut0mem.h +++ b/innobase/include/ut0mem.h @@ -67,7 +67,7 @@ ut_free( /*====*/ void* ptr); /* in, own: memory block */ /************************************************************************** -Frees all allocated memory not freed yet. */ +Frees in shutdown all allocated memory not freed yet. */ void ut_free_all_mem(void); diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 74dc4aea515..af589cd9441 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -83,10 +83,6 @@ x-lock also has an explicit non-gap record x-lock. Therefore, as locks are released, we can grant locks to waiting lock requests purely by looking at the explicit lock requests in the queue. -RULE 2: Granted non-gap locks on a record are always ahead in the queue -------- -of waiting non-gap locks on a record. - RULE 3: Different transactions cannot have conflicting granted non-gap locks ------- on a record at the same time. However, they can have conflicting granted gap @@ -356,7 +352,7 @@ lock_mutex_enter_kernel(void) } /************************************************************************* -Releses the kernel mutex. This function is used in this module to allow +Releases the kernel mutex. This function is used in this module to allow monitoring the contention degree on the kernel mutex caused by the lock operations. */ UNIV_INLINE @@ -515,6 +511,53 @@ lock_rec_mutex_own_all(void) #endif /************************************************************************* +Checks that a transaction id is sensible, i.e., not in the future. 
*/ + +ibool +lock_check_trx_id_sanity( +/*=====================*/ + /* out: TRUE if ok */ + dulint trx_id, /* in: trx id */ + rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: clustered index */ + ibool has_kernel_mutex)/* in: TRUE if the caller owns the + kernel mutex */ +{ + char err_buf[500]; + ibool is_ok = TRUE; + + if (!has_kernel_mutex) { + mutex_enter(&kernel_mutex); + } + + /* A sanity check: the trx_id in rec must be smaller than the global + trx id counter */ + + if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) { + rec_sprintf(err_buf, 400, rec); + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Error: transaction id associated with record\n%s\n" +"InnoDB: in table %s index %s\n" +"InnoDB: is %lu %lu which is higher than the global trx id counter %lu %lu!\n" +"InnoDB: The table is corrupt. You have to do dump + drop + reimport.\n", + err_buf, index->table_name, index->name, + ut_dulint_get_high(trx_id), + ut_dulint_get_low(trx_id), + ut_dulint_get_high(trx_sys->max_trx_id), + ut_dulint_get_low(trx_sys->max_trx_id)); + + is_ok = FALSE; + } + + if (!has_kernel_mutex) { + mutex_exit(&kernel_mutex); + } + + return(is_ok); +} + +/************************************************************************* Checks that a record is seen in a consistent read. */ ibool @@ -532,6 +575,10 @@ lock_clust_rec_cons_read_sees( ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); + /* NOTE that we call this function while holding the search + system latch. To obey the latching order we must NOT reserve the + kernel mutex here! */ + trx_id = row_get_rec_trx_id(rec, index); if (read_view_sees_trx_id(view, trx_id)) { @@ -562,10 +609,16 @@ lock_sec_rec_cons_read_sees( read_view_t* view) /* in: consistent read view */ { dulint max_trx_id; - + + UT_NOT_USED(index); + ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec)); + /* NOTE that we might call this function while holding the search + system latch. 
To obey the latching order we must NOT reserve the + kernel mutex here! */ + if (recv_recovery_is_on()) { return(FALSE); @@ -1569,6 +1622,15 @@ lock_sec_rec_some_has_impl_off_kernel( /* Ok, in this case it is possible that some transaction has an implicit x-lock. We have to look in the clustered index. */ + if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), rec, index, + TRUE)) { + buf_page_print(page); + + /* The page is corrupt: try to avoid a crash by returning + NULL */ + return(NULL); + } + return(row_vers_impl_x_locked_off_kernel(rec, index)); } @@ -2565,7 +2627,7 @@ lock_move_rec_list_start( ulint heap_no; ulint type_mode; - ut_ad(new_page); + ut_a(new_page); lock_mutex_enter_kernel(); @@ -3028,7 +3090,7 @@ lock_deadlock_recursive( we return LOCK_VICTIM_IS_START */ { lock_t* lock; - ulint bit_no; + ulint bit_no = ULINT_UNDEFINED; trx_t* lock_trx; char* err_buf; ulint ret; @@ -3067,6 +3129,7 @@ lock_deadlock_recursive( lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock); } else { ut_ad(lock_get_type(lock) == LOCK_REC); + ut_a(bit_no != ULINT_UNDEFINED); lock = lock_rec_get_prev(lock, bit_no); } @@ -4205,7 +4268,6 @@ lock_rec_queue_validate( { trx_t* impl_trx; lock_t* lock; - ibool is_waiting; ut_a(rec); @@ -4266,8 +4328,6 @@ lock_rec_queue_validate( } } - is_waiting = FALSE; - lock = lock_rec_get_first(rec); while (lock) { @@ -4280,8 +4340,6 @@ lock_rec_queue_validate( } if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) { - - ut_a(!is_waiting); if (lock_get_mode(lock) == LOCK_S) { ut_a(!lock_rec_other_has_expl_req(LOCK_X, @@ -4293,7 +4351,6 @@ lock_rec_queue_validate( } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { - is_waiting = TRUE; ut_a(lock_rec_has_to_wait_in_queue(lock)); } diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index 6892e6fc6c1..b0140ef767b 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -33,6 +33,11 @@ log_t* log_sys = NULL; ibool log_do_write = TRUE; ibool log_debug_writes = FALSE; 
+/* These control how often we print warnings if the last checkpoint is too +old */ +ibool log_has_printed_chkp_warning = FALSE; +time_t log_last_warning_time; + /* Pointer to this variable is used as the i/o-message when we do i/o to an archive */ byte log_archive_io; @@ -178,7 +183,8 @@ loop: /* Not enough free space, do a syncronous flush of the log buffer */ - log_write_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS, TRUE); + + log_buffer_flush_to_disk(); count++; @@ -298,6 +304,7 @@ log_close(void) dulint oldest_lsn; dulint lsn; log_t* log = log_sys; + ulint checkpoint_age; ut_ad(mutex_own(&(log->mutex))); @@ -321,8 +328,34 @@ log_close(void) log->check_flush_or_checkpoint = TRUE; } - if (ut_dulint_minus(lsn, log->last_checkpoint_lsn) - <= log->max_modified_age_async) { + checkpoint_age = ut_dulint_minus(lsn, log->last_checkpoint_lsn); + + if (checkpoint_age >= log->log_group_capacity) { + /* TODO: split btr_store_big_rec_extern_fields() into small + steps so that we can release all latches in the middle, and + call log_free_check() to ensure we never write over log written + after the latest checkpoint. In principle, we should split all + big_rec operations, but other operations are smaller. 
*/ + + if (!log_has_printed_chkp_warning + || difftime(time(NULL), log_last_warning_time) > 15) { + + log_has_printed_chkp_warning = TRUE; + log_last_warning_time = time(NULL); + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: ERROR: the age of the last checkpoint is %lu,\n" +"InnoDB: which exceeds the log group capacity %lu.\n" +"InnoDB: If you are using big BLOB or TEXT rows, you must set the\n" +"InnoDB: combined size of log files at least 10 times bigger than the\n" +"InnoDB: largest such row.\n", + checkpoint_age, log->log_group_capacity); + } + } + + if (checkpoint_age <= log->max_modified_age_async) { + goto function_exit; } @@ -331,8 +364,7 @@ log_close(void) if (ut_dulint_is_zero(oldest_lsn) || (ut_dulint_minus(lsn, oldest_lsn) > log->max_modified_age_async) - || (ut_dulint_minus(lsn, log->last_checkpoint_lsn) - > log->max_checkpoint_age_async)) { + || checkpoint_age > log->max_checkpoint_age_async) { log->check_flush_or_checkpoint = TRUE; } @@ -375,7 +407,7 @@ log_pad_current_log_block(void) log_close(); log_release(); - ut_ad((ut_dulint_get_low(lsn) % OS_FILE_LOG_BLOCK_SIZE) + ut_anp((ut_dulint_get_low(lsn) % OS_FILE_LOG_BLOCK_SIZE) == LOG_BLOCK_HDR_SIZE); } @@ -468,7 +500,7 @@ log_group_calc_lsn_offset( offset = (gr_lsn_size_offset + difference) % group_size; - ut_a(offset <= 0xFFFFFFFF); + ut_a(offset < (((ib_longlong) 1) << 32)); /* offset must be < 4 GB */ /* printf("Offset is %lu gr_lsn_offset is %lu difference is %lu\n", (ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference); @@ -550,7 +582,6 @@ log_calc_max_ages(void) the database server */ { log_group_t* group; - ulint n_threads; ulint margin; ulint free; ibool success = TRUE; @@ -560,8 +591,6 @@ log_calc_max_ages(void) ut_ad(!mutex_own(&(log_sys->mutex))); - n_threads = srv_get_n_threads(); - mutex_enter(&(log_sys->mutex)); group = UT_LIST_GET_FIRST(log_sys->log_groups); @@ -589,12 +618,15 @@ log_calc_max_ages(void) group = UT_LIST_GET_NEXT(log_groups, group); } + /* Add extra 
safety */ + smallest_capacity = smallest_capacity - smallest_capacity / 10; + /* For each OS thread we must reserve so much free space in the smallest log group that it can accommodate the log entries produced by single query steps: running out of free log space is a serious system error which requires rebooting the database. */ - free = LOG_CHECKPOINT_FREE_PER_THREAD * n_threads + free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency) + LOG_CHECKPOINT_EXTRA_FREE; if (free >= smallest_capacity / 2) { success = FALSE; @@ -606,6 +638,10 @@ log_calc_max_ages(void) margin = ut_min(margin, log_sys->adm_checkpoint_interval); + margin = margin - margin / 10; /* Add still some extra safety */ + + log_sys->log_group_capacity = smallest_capacity; + log_sys->max_modified_age_async = margin - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC; log_sys->max_modified_age_sync = margin @@ -625,7 +661,7 @@ failure: if (!success) { fprintf(stderr, - "Error: log file group too small for the number of threads\n"); +"InnoDB: Error: log file group too small for innodb_thread_concurrency\n"); } return(success); @@ -1070,8 +1106,8 @@ log_group_write_buf( ulint i; ut_ad(mutex_own(&(log_sys->mutex))); - ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_ad(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_anp(len % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_anp(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); if (new_data_offset == 0) { write_header = TRUE; @@ -1365,6 +1401,24 @@ do_waits: } /******************************************************************** +Does a syncronous flush of the log buffer to disk. 
*/ + +void +log_buffer_flush_to_disk(void) +/*==========================*/ +{ + dulint lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE); +} + +/******************************************************************** Tries to establish a big enough margin of free space in the log buffer, such that a new log entry can be catenated without an immediate need for a flush. */ static @@ -1374,6 +1428,7 @@ log_flush_margin(void) { ibool do_flush = FALSE; log_t* log = log_sys; + dulint lsn; mutex_enter(&(log->mutex)); @@ -1384,13 +1439,14 @@ log_flush_margin(void) free space */ } else { do_flush = TRUE; + lsn = log->lsn; } } mutex_exit(&(log->mutex)); if (do_flush) { - log_write_up_to(ut_dulint_max, LOG_NO_WAIT, FALSE); + log_write_up_to(lsn, LOG_NO_WAIT, FALSE); } } @@ -2123,11 +2179,11 @@ log_group_archive( start_lsn = log_sys->archived_lsn; - ut_ad(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_anp(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); end_lsn = log_sys->next_archived_lsn; - ut_ad(ut_dulint_get_low(end_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_anp(ut_dulint_get_low(end_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); buf = log_sys->archive_buf; @@ -2234,7 +2290,7 @@ loop: group->next_archived_file_no = group->archived_file_no + n_files; group->next_archived_offset = next_offset % group->file_size; - ut_ad(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_anp(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0); } /********************************************************* @@ -2429,8 +2485,8 @@ loop: start_lsn = log_sys->archived_lsn; if (calc_new_limit) { - ut_ad(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0); - + ut_anp(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE + == 0); limit_lsn = ut_dulint_add(start_lsn, log_sys->archive_buf_size); @@ -2916,6 +2972,7 @@ loop: mutex_enter(&kernel_mutex); + /* Check that 
there are no longer transactions */ if (trx_n_mysql_transactions > 0 || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { @@ -2924,6 +2981,8 @@ loop: goto loop; } + /* Check that the master thread is suspended */ + if (srv_n_threads_active[SRV_MASTER] != 0) { mutex_exit(&kernel_mutex); @@ -2952,7 +3011,6 @@ loop: } log_archive_all(); - log_make_checkpoint_at(ut_dulint_max, TRUE); mutex_enter(&(log_sys->mutex)); @@ -2961,8 +3019,9 @@ loop: if (ut_dulint_cmp(lsn, log_sys->last_checkpoint_lsn) != 0 || (srv_log_archive_on - && ut_dulint_cmp(lsn, - ut_dulint_add(log_sys->archived_lsn, LOG_BLOCK_HDR_SIZE)) != 0)) { + && ut_dulint_cmp(lsn, + ut_dulint_add(log_sys->archived_lsn, LOG_BLOCK_HDR_SIZE)) + != 0)) { mutex_exit(&(log_sys->mutex)); @@ -2981,10 +3040,22 @@ loop: mutex_exit(&(log_sys->mutex)); + mutex_enter(&kernel_mutex); + /* Check that the master thread has stayed suspended */ + if (srv_n_threads_active[SRV_MASTER] != 0) { + fprintf(stderr, +"InnoDB: Warning: the master thread woke up during shutdown\n"); + + mutex_exit(&kernel_mutex); + + goto loop; + } + mutex_exit(&kernel_mutex); + fil_flush_file_spaces(FIL_TABLESPACE); fil_flush_file_spaces(FIL_LOG); - /* The following fil_write_... will pass the buffer pool: therefore + /* The next fil_write_... will pass the buffer pool: therefore it is essential that the buffer pool has been completely flushed to disk! 
*/ @@ -2993,12 +3064,14 @@ loop: goto loop; } + /* The lock timeout thread should now have exited */ + if (srv_lock_timeout_and_monitor_active) { goto loop; } - /* We now suspend also the InnoDB error monitor thread */ + /* We now let also the InnoDB error monitor thread to exit */ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; @@ -3008,6 +3081,7 @@ loop: } /* Make some checks that the server really is quiet */ + ut_a(srv_n_threads_active[SRV_MASTER] == 0); ut_a(buf_all_freed()); ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn)); @@ -3016,6 +3090,7 @@ loop: fil_flush_file_spaces(FIL_TABLESPACE); /* Make some checks that the server really is quiet */ + ut_a(srv_n_threads_active[SRV_MASTER] == 0); ut_a(buf_all_freed()); ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn)); } @@ -3072,6 +3147,28 @@ log_check_log_recs( } /********************************************************** +Peeks the current lsn. */ + +ibool +log_peek_lsn( +/*=========*/ + /* out: TRUE if success, FALSE if could not get the + log system mutex */ + dulint* lsn) /* out: if returns TRUE, current lsn is here */ +{ + if (0 == mutex_enter_nowait(&(log_sys->mutex), (char*)__FILE__, + __LINE__)) { + *lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + return(TRUE); + } + + return(FALSE); +} + +/********************************************************** Prints info of the log. 
*/ void diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index 4efe4e7b23d..ce90683ae7f 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -46,6 +46,8 @@ ibool recv_recovery_from_backup_on = FALSE; ibool recv_needed_recovery = FALSE; +ibool recv_lsn_checks_on = FALSE; + /* If the following is TRUE, the buffer pool file pages must be invalidated after recovery and no ibuf operations are allowed; this becomes TRUE if the log record hash table becomes too full, and log records must be merged @@ -71,6 +73,12 @@ ulint recv_previous_parsed_rec_is_multi = 0; ulint recv_max_parsed_page_no = 0; +/* The maximum lsn we see for a page during the recovery process. If this +is bigger than the lsn we are able to scan up to, that is an indication that +the recovery failed and the database may be corrupt. */ + +dulint recv_max_page_lsn; + /* This many frames must be left free in the buffer pool when we scan the log and store the scanned log records in the buffer pool: we will use these free frames to read in pages when we start applying the @@ -140,6 +148,8 @@ recv_sys_init( OS_FILE_LOG_BLOCK_SIZE); recv_sys->found_corrupt_log = FALSE; + recv_max_page_lsn = ut_dulint_zero; + mutex_exit(&(recv_sys->mutex)); } @@ -981,7 +991,7 @@ recv_recover_page( ulint space, /* in: space id */ ulint page_no) /* in: page number */ { - buf_block_t* block; + buf_block_t* block = NULL; recv_addr_t* recv_addr; recv_t* recv; byte* buf; @@ -1093,7 +1103,7 @@ recv_recover_page( page_lsn = page_newest_lsn; mach_write_to_8(page + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN, ut_dulint_zero); + - FIL_PAGE_END_LSN_OLD_CHKSUM, ut_dulint_zero); mach_write_to_8(page + FIL_PAGE_LSN, ut_dulint_zero); } @@ -1115,7 +1125,7 @@ recv_recover_page( recv_parse_or_apply_log_rec_body(recv->type, buf, buf + recv->len, page, &mtr); mach_write_to_8(page + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN, + - FIL_PAGE_END_LSN_OLD_CHKSUM, ut_dulint_add(recv->start_lsn, recv->len)); mach_write_to_8(page + 
FIL_PAGE_LSN, @@ -1132,6 +1142,10 @@ recv_recover_page( mutex_enter(&(recv_sys->mutex)); + if (ut_dulint_cmp(recv_max_page_lsn, page_lsn) < 0) { + recv_max_page_lsn = page_lsn; + } + recv_addr->state = RECV_PROCESSED; ut_a(recv_sys->n_addrs); @@ -1140,6 +1154,8 @@ recv_recover_page( mutex_exit(&(recv_sys->mutex)); if (!recover_backup && modification_to_page) { + ut_a(block); + buf_flush_recv_note_modification(block, start_lsn, end_lsn); } @@ -1347,6 +1363,7 @@ loop: mutex_exit(&(recv_sys->mutex)); } +#ifdef UNIV_HOTBACKUP /*********************************************************************** Applies log records in the hash table to a backup. */ @@ -1528,8 +1545,8 @@ recv_check_identical( for (i = 0; i < len; i++) { if (str1[i] != str2[i]) { - fprintf(stderr, "Strings do not match at offset %lu\n", i); - + fprintf(stderr, + "Strings do not match at offset %lu\n", i); ut_print_buf(str1 + i, 16); fprintf(stderr, "\n"); ut_print_buf(str2 + i, 16); @@ -1662,6 +1679,7 @@ recv_compare_spaces_low( recv_compare_spaces(space1, space2, n_pages); } +#endif /*********************************************************************** Tries to parse a single log record and returns its length. 
*/ @@ -2196,9 +2214,12 @@ recv_scan_log_recs( while (log_block < buf + len && !finished) { no = log_block_get_hdr_no(log_block); +/* + fprintf(stderr, "Log block header no %lu\n", no); - /* fprintf(stderr, "Log block header no %lu\n", no); */ - + fprintf(stderr, "Scanned lsn no %lu\n", + log_block_convert_lsn_to_no(scanned_lsn)); +*/ if (no != log_block_convert_lsn_to_no(scanned_lsn) || !log_block_checksum_is_ok_or_old_format(log_block)) { @@ -2590,7 +2611,6 @@ recv_recovery_from_checkpoint_start( recv_group_scan_log_recs(group, &contiguous_lsn, &group_scanned_lsn); - group->scanned_lsn = group_scanned_lsn; if (ut_dulint_cmp(old_scanned_lsn, group_scanned_lsn) < 0) { @@ -2607,6 +2627,31 @@ recv_recovery_from_checkpoint_start( group = UT_LIST_GET_NEXT(log_groups, group); } + /* We currently have only one log group */ + if (ut_dulint_cmp(group_scanned_lsn, checkpoint_lsn) < 0) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: ERROR: We were only able to scan the log up to\n" +"InnoDB: %lu %lu, but a checkpoint was at %lu %lu.\n" +"InnoDB: It is possible that the database is now corrupt!\n", + ut_dulint_get_high(group_scanned_lsn), + ut_dulint_get_low(group_scanned_lsn), + ut_dulint_get_high(checkpoint_lsn), + ut_dulint_get_low(checkpoint_lsn)); + } + + if (ut_dulint_cmp(group_scanned_lsn, recv_max_page_lsn) < 0) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: ERROR: We were only able to scan the log up to %lu %lu\n" +"InnoDB: but a database page a had an lsn %lu %lu. 
It is possible that the\n" +"InnoDB: database is now corrupt!\n", + ut_dulint_get_high(group_scanned_lsn), + ut_dulint_get_low(group_scanned_lsn), + ut_dulint_get_high(recv_max_page_lsn), + ut_dulint_get_low(recv_max_page_lsn)); + } + if (ut_dulint_cmp(recv_sys->recovered_lsn, checkpoint_lsn) < 0) { mutex_exit(&(log_sys->mutex)); @@ -2660,6 +2705,8 @@ recv_recovery_from_checkpoint_start( sync_order_checks_on = FALSE; + recv_lsn_checks_on = TRUE; + /* The database is now ready to start almost normal processing of user transactions: transaction rollbacks and the application of the log records in the hash table can be run in background. */ diff --git a/innobase/mem/mem0pool.c b/innobase/mem/mem0pool.c index 382e505b63f..b004a8c4df7 100644 --- a/innobase/mem/mem0pool.c +++ b/innobase/mem/mem0pool.c @@ -99,6 +99,12 @@ mem_pool_t* mem_comm_pool = NULL; ulint mem_out_of_mem_err_msg_count = 0; +/* We use this counter to check that the mem pool mutex does not leak; +this is to track a strange assertion failure reported at +mysql@lists.mysql.com */ + +ulint mem_n_threads_inside = 0; + /************************************************************************ Reserves the mem pool mutex. 
*/ @@ -328,6 +334,9 @@ mem_area_alloc( n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE)); mutex_enter(&(pool->mutex)); + mem_n_threads_inside++; + + ut_a(mem_n_threads_inside == 1); area = UT_LIST_GET_FIRST(pool->free_list[n]); @@ -338,6 +347,7 @@ mem_area_alloc( /* Out of memory in memory pool: we try to allocate from the operating system with the regular malloc: */ + mem_n_threads_inside--; mutex_exit(&(pool->mutex)); return(ut_malloc(size)); @@ -353,6 +363,16 @@ mem_area_alloc( n); mem_analyze_corruption((byte*)area); + + /* Try to analyze a strange assertion failure reported at + mysql@lists.mysql.com where the free bit IS 1 in the + hex dump above */ + + if (mem_area_get_free(area)) { + fprintf(stderr, +"InnoDB: Probably a race condition because now the area is marked free!\n"); + } + ut_a(0); } @@ -374,6 +394,7 @@ mem_area_alloc( pool->reserved += mem_area_get_size(area); + mem_n_threads_inside--; mutex_exit(&(pool->mutex)); ut_ad(mem_pool_validate(pool)); @@ -495,6 +516,9 @@ mem_area_free( n = ut_2_log(size); mutex_enter(&(pool->mutex)); + mem_n_threads_inside++; + + ut_a(mem_n_threads_inside == 1); if (buddy && mem_area_get_free(buddy) && (size == mem_area_get_size(buddy))) { @@ -518,6 +542,7 @@ mem_area_free( pool->reserved += ut_2_exp(n); + mem_n_threads_inside--; mutex_exit(&(pool->mutex)); mem_area_free(new_ptr, pool); @@ -533,6 +558,7 @@ mem_area_free( pool->reserved -= size; } + mem_n_threads_inside--; mutex_exit(&(pool->mutex)); ut_ad(mem_pool_validate(pool)); @@ -577,7 +603,7 @@ mem_pool_validate( } } - ut_a(free + pool->reserved == pool->size + ut_anp(free + pool->reserved == pool->size - (pool->size % MEM_AREA_MIN_SIZE)); mutex_exit(&(pool->mutex)); diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index e401503c4e3..e31fd1d9efe 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -60,6 +60,7 @@ struct os_aio_slot_struct{ ulint pos; /* index of the slot in the aio array */ ibool reserved; /* TRUE if this 
slot is reserved */ + time_t reservation_time;/* time when reserved */ ulint len; /* length of the block to read or write */ byte* buf; /* buffer used in i/o */ @@ -147,6 +148,12 @@ time_t os_last_printout; ibool os_has_said_disk_full = FALSE; +/* The mutex protecting the following counts of pending pread and pwrite +operations */ +os_mutex_t os_file_count_mutex; +ulint os_file_n_pending_preads = 0; +ulint os_file_n_pending_pwrites = 0; + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -204,9 +211,9 @@ os_file_get_last_error(void) if (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %li in a file operation.\n" + " InnoDB: Operating system error number %lu in a file operation.\n" "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n", - (long) err); + err); if (err == ERROR_PATH_NOT_FOUND) { fprintf(stderr, @@ -248,9 +255,9 @@ os_file_get_last_error(void) ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %li in a file operation.\n" + " InnoDB: Operating system error number %lu in a file operation.\n" "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n", - (long) err); + err); if (err == ENOENT) { fprintf(stderr, @@ -344,7 +351,8 @@ os_file_handle_error( fprintf(stderr, "InnoDB: File name %s\n", name); } - fprintf(stderr, "InnoDB: System call %s.\n", operation); + fprintf(stderr, "InnoDB: File operation call: '%s'.\n", + operation); fprintf(stderr, "InnoDB: Cannot continue operation.\n"); fflush(stderr); @@ -364,6 +372,8 @@ os_io_init_simple(void) { ulint i; + os_file_count_mutex = os_mutex_create(NULL); + for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { os_file_seek_mutexes[i] = os_mutex_create(NULL); } @@ -415,9 +425,8 @@ try_again: file = CreateFile(name, access, - FILE_SHARE_READ | FILE_SHARE_WRITE, - /* 
file can be read and written - also by other processes */ + FILE_SHARE_READ,/* file can be read also by other + processes */ NULL, /* default security attributes */ create_flag, attributes, @@ -481,6 +490,101 @@ try_again: return(file); #endif } + +/******************************************************************** +A simple function to open or create a file. */ + +os_file_t +os_file_create_simple_no_error_handling( +/*====================================*/ + /* out, own: handle to the file, not defined if error, + error number can be retrieved with os_get_last_error */ + char* name, /* in: name of the file or path as a null-terminated + string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened + (if does not exist, error), or OS_FILE_CREATE if a new + file is created (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD create_flag; + DWORD access; + DWORD attributes = 0; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + create_flag = 0; + ut_error; + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + access = 0; + ut_error; + } + + file = CreateFile(name, + access, + FILE_SHARE_READ,/* file can be read also by other + processes */ + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /* no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + } else { + *success = TRUE; + } + + return(file); +#else + os_file_t file; + int create_flag; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + } else if 
(create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR + | S_IRGRP | S_IWGRP); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; + } else { + *success = TRUE; + } + + return(file); +#endif +} + /******************************************************************** Opens an existing file or creates a new. */ @@ -496,7 +600,11 @@ os_file_create( file is created (if exists, error), OS_FILE_OVERWRITE if a new is created or an old overwritten */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o - is desired, OS_FILE_NORMAL, if any normal file */ + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. and srv_.. + variables whether we really use async i/o or + unbuffered i/o: look in the function source code for + the exact rules */ ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success)/* out: TRUE if succeed, FALSE if error */ { @@ -521,8 +629,8 @@ try_again: } if (purpose == OS_FILE_AIO) { - /* use asynchronous (overlapped) io and no buffering - of writes in the OS */ + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ attributes = 0; #ifdef WIN_ASYNC_IO if (os_aio_use_native_aio) { @@ -530,17 +638,13 @@ try_again: } #endif #ifdef UNIV_NON_BUFFERED_IO - if (type == OS_LOG_FILE) { + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { /* Do not use unbuffered i/o to log files because - to allow group commit to work when MySQL binlogging - is used we must separate log file write and log - file flush to disk. 
*/ - } else { - if (srv_win_file_flush_method == - SRV_WIN_IO_UNBUFFERED) { - attributes = attributes - | FILE_FLAG_NO_BUFFERING; - } + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + } else if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; } #endif } else if (purpose == OS_FILE_NORMAL) { @@ -550,12 +654,9 @@ try_again: /* Do not use unbuffered i/o to log files because value 2 denotes that we do not flush the log at every commit, but only once per second */ - } else { - if (srv_win_file_flush_method == - SRV_WIN_IO_UNBUFFERED) { - attributes = attributes - | FILE_FLAG_NO_BUFFERING; - } + } else if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; } #endif } else { @@ -566,9 +667,14 @@ try_again: file = CreateFile(name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ - FILE_SHARE_READ | FILE_SHARE_WRITE, - /* file can be read and written - also by other processes */ + FILE_SHARE_READ,/* File can be read also by other + processes; we must give the read + permission because of ibbackup. We do + not give the write permission to + others because if one would succeed to + start 2 instances of mysqld on the + SAME files, that could cause severe + database corruption! 
*/ NULL, /* default security attributes */ create_flag, attributes, @@ -592,30 +698,70 @@ try_again: os_file_t file; int create_flag; ibool retry; + const char* mode_str = NULL; + const char* type_str = NULL; + const char* purpose_str = NULL; try_again: ut_a(name); if (create_mode == OS_FILE_OPEN) { + mode_str = "OPEN"; + create_flag = O_RDWR; } else if (create_mode == OS_FILE_CREATE) { + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; } else if (create_mode == OS_FILE_OVERWRITE) { + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; } else { create_flag = 0; ut_error; } - UT_NOT_USED(purpose); + if (type == OS_LOG_FILE) { + type_str = "LOG"; + } else if (type == OS_DATA_FILE) { + type_str = "DATA"; + } else { + ut_a(0); + } + + if (purpose == OS_FILE_AIO) { + purpose_str = "AIO"; + } else if (purpose == OS_FILE_NORMAL) { + purpose_str = "NORMAL"; + } else { + ut_a(0); + } +/* printf("Opening file %s, mode %s, type %s, purpose %s\n", + name, mode_str, type_str, purpose_str); */ #ifdef O_SYNC - if ((!srv_use_doublewrite_buf || type != OS_DATA_FILE) + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + if (type == OS_LOG_FILE && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { +/* printf("Using O_SYNC for file %s\n", name); */ + create_flag = create_flag | O_SYNC; } #endif +#ifdef O_DIRECT + /* We let O_DIRECT only affect data files */ + if (type != OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { + +/* printf("Using O_DIRECT for file %s\n", name); */ + + create_flag = create_flag | O_DIRECT; + } +#endif if (create_mode == OS_FILE_CREATE) { file = open(name, create_flag, os_innodb_umask); } else { @@ -677,6 +823,41 @@ os_file_close( } /*************************************************************************** +Closes a file handle. 
*/ + +ibool +os_file_close_no_error_handling( +/*============================*/ + /* out: TRUE if success */ + os_file_t file) /* in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** Gets a file size. */ ibool @@ -849,6 +1030,7 @@ os_file_flush( #ifdef HAVE_FDATASYNC ret = fdatasync(file); #else +/* printf("Flushing to file %lu\n", (ulint)file); */ ret = fsync(file); #endif os_n_fsyncs++; @@ -896,6 +1078,7 @@ os_file_pread( offset */ { off_t offs; + ssize_t n_bytes; ut_a((offset & 0xFFFFFFFF) == offset); @@ -917,7 +1100,17 @@ os_file_pread( os_n_file_reads++; #ifdef HAVE_PREAD - return(pread(file, buf, n, offs)); + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; + os_mutex_exit(os_file_count_mutex); + + n_bytes = pread(file, buf, n, offs); + + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads--; + os_mutex_exit(os_file_count_mutex); + + return(n_bytes); #else { ssize_t ret; @@ -982,8 +1175,16 @@ os_file_pwrite( os_n_file_writes++; #ifdef HAVE_PWRITE + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites++; + os_mutex_exit(os_file_count_mutex); + ret = pwrite(file, buf, n, offs); + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites--; + os_mutex_exit(os_file_count_mutex); + if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC && srv_unix_file_flush_method != SRV_UNIX_NOSYNC && !os_do_not_call_flush_at_each_write) { @@ -1157,6 +1358,7 @@ os_file_write( DWORD high; ulint i; ulint n_retries = 0; + ulint err; ut_a((offset & 0xFFFFFFFF) == offset); @@ -1224,18 +1426,27 @@ retry: if (!os_has_said_disk_full) { + err = (ulint)GetLastError(); + ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: Write to file %s 
failed at offset %lu %lu.\n" "InnoDB: %lu bytes should have been written, only %lu were written.\n" "InnoDB: Operating system error number %lu.\n" -"InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n" -"InnoDB: what the error number means.\n" "InnoDB: Check that your OS and file system support files of this size.\n" "InnoDB: Check also that the disk is not full or a disk quota exceeded.\n", name, offset_high, offset, n, (ulint)len, - (ulint)GetLastError()); + err); + + if (strerror((int)err) != NULL) { + fprintf(stderr, +"InnoDB: Error number %lu means '%s'.\n", err, strerror((int)err)); + } + + fprintf(stderr, +"InnoDB: See also section 13.2 at http://www.innodb.com/ibman.html\n" +"InnoDB: about operating system error numbers.\n"); os_has_said_disk_full = TRUE; } @@ -1259,12 +1470,19 @@ retry: " InnoDB: Error: Write to file %s failed at offset %lu %lu.\n" "InnoDB: %lu bytes should have been written, only %ld were written.\n" "InnoDB: Operating system error number %lu.\n" -"InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n" -"InnoDB: what the error number means or use the perror program of MySQL.\n" "InnoDB: Check that your OS and file system support files of this size.\n" "InnoDB: Check also that the disk is not full or a disk quota exceeded.\n", name, offset_high, offset, n, (long int)ret, (ulint)errno); + if (strerror(errno) != NULL) { + fprintf(stderr, +"InnoDB: Error number %lu means '%s'.\n", (ulint)errno, strerror(errno)); + } + + fprintf(stderr, +"InnoDB: See also section 13.2 at http://www.innodb.com/ibman.html\n" +"InnoDB: about operating system error numbers.\n"); + os_has_said_disk_full = TRUE; } @@ -1372,19 +1590,35 @@ os_aio_init( os_io_init_simple(); + for (i = 0; i < n_segments; i++) { + srv_io_thread_op_info[i] = (char*)"not started yet"; + } + n_per_seg = n / n_segments; n_write_segs = (n_segments - 2) / 2; n_read_segs = n_segments - 2 - n_write_segs; /* printf("Array n per seg %lu\n", n_per_seg); */ + 
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[0] = (char*)"insert buffer thread"; + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[1] = (char*)"log thread"; + os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, n_read_segs); + for (i = 2; i < 2 + n_read_segs; i++) { + srv_io_thread_function[i] = (char*)"read thread"; + } + os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, n_write_segs); - os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); - - os_aio_log_array = os_aio_array_create(n_per_seg, 1); + for (i = 2 + n_read_segs; i < n_segments; i++) { + srv_io_thread_function[i] = (char*)"write thread"; + } os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); @@ -1677,6 +1911,7 @@ loop: } slot->reserved = TRUE; + slot->reservation_time = time(NULL); slot->message1 = message1; slot->message2 = message2; slot->file = file; @@ -2249,6 +2484,8 @@ os_aio_simulated_handle( ulint total_len; ulint offs; ulint lowest_offset; + ulint biggest_age; + ulint age; byte* combined_buf; byte* combined_buf2; ibool ret; @@ -2301,22 +2538,55 @@ restart: n_consecutive = 0; - /* Look for an i/o request at the lowest offset in the array - (we ignore the high 32 bits of the offset in these heuristics) */ + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. 
*/ + biggest_age = 0; lowest_offset = ULINT_MAX; - + for (i = 0; i < n; i++) { slot = os_aio_array_get_nth_slot(array, i + segment * n); - if (slot->reserved && slot->offset < lowest_offset) { + if (slot->reserved) { + age = (ulint)difftime(time(NULL), + slot->reservation_time); - /* Found an i/o request */ - consecutive_ios[0] = slot; + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { - n_consecutive = 1; + /* Found an i/o request */ + consecutive_ios[0] = slot; - lowest_offset = slot->offset; + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = ULINT_MAX; + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, + i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } } } @@ -2422,7 +2692,7 @@ consecutive_loop: + FIL_PAGE_LSN + 4) != mach_read_from_4(combined_buf + len2 + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN + 4)) { + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: The page to be written seems corrupt!\n"); @@ -2583,14 +2853,15 @@ os_aio_print( double avg_bytes_read; ulint i; - if (buf_end - buf < 1000) { + if (buf_end - buf < 1200) { return; } for (i = 0; i < srv_n_file_io_threads; i++) { - buf += sprintf(buf, "I/O thread %lu state: %s\n", i, - srv_io_thread_op_info[i]); + buf += sprintf(buf, "I/O thread %lu state: %s (%s)\n", i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); } buf += sprintf(buf, "Pending normal aio reads:"); @@ -2665,6 +2936,12 @@ loop: "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", os_n_file_reads, os_n_file_writes, 
os_n_fsyncs); + if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { + buf += sprintf(buf, + "%lu pending preads, %lu pending pwrites\n", + os_file_n_pending_preads, os_file_n_pending_pwrites); + } + if (os_n_file_reads == os_n_file_reads_old) { avg_bytes_read = 0.0; } else { diff --git a/innobase/os/os0sync.c b/innobase/os/os0sync.c index bf5fc57bf57..827d68501db 100644 --- a/innobase/os/os0sync.c +++ b/innobase/os/os0sync.c @@ -143,6 +143,7 @@ os_event_create( ut_a(0 == pthread_cond_init(&(event->cond_var), NULL)); #endif event->is_set = FALSE; + event->signal_count = 0; #endif /* __WIN__ */ /* Put to the list of events */ @@ -218,6 +219,7 @@ os_event_set( /* Do nothing */ } else { event->is_set = TRUE; + event->signal_count += 1; ut_a(0 == pthread_cond_broadcast(&(event->cond_var))); } @@ -310,9 +312,15 @@ os_event_wait( os_thread_exit(NULL); } #else + ib_longlong old_signal_count; + os_fast_mutex_lock(&(event->os_mutex)); + + old_signal_count = event->signal_count; loop: - if (event->is_set == TRUE) { + if (event->is_set == TRUE + || event->signal_count != old_signal_count) { + os_fast_mutex_unlock(&(event->os_mutex)); if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { @@ -326,8 +334,9 @@ loop: pthread_cond_wait(&(event->cond_var), &(event->os_mutex)); - /* Solaris manual said that spurious wakeups may occur: we have - to check the 'is_set' variable again */ + /* Solaris manual said that spurious wakeups may occur: we have to + check if the event really has been signaled after we came here to + wait */ goto loop; #endif diff --git a/innobase/os/os0thread.c b/innobase/os/os0thread.c index 9af98760ad1..1252cc5e4b7 100644 --- a/innobase/os/os0thread.c +++ b/innobase/os/os0thread.c @@ -187,8 +187,8 @@ os_thread_exit( is cast as a DWORD */ { #ifdef UNIV_DEBUG_THREAD_CREATION - printf("A thread exits.\n"); - printf("Thread id %lu\n", os_thread_pf(os_thread_get_curr_id())); + printf("Thread exits, id %lu\n", + 
os_thread_pf(os_thread_get_curr_id())); #endif os_mutex_enter(os_sync_mutex); os_thread_count--; diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index d3a40668c4b..7e2fc19c00f 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -14,6 +14,7 @@ Created 10/4/1994 Heikki Tuuri #include "rem0cmp.h" #include "mtr0log.h" #include "log0recv.h" +#include "rem0cmp.h" ulint page_cur_short_succ = 0; @@ -218,6 +219,8 @@ page_cur_search_with_match( || (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE) || (mode == PAGE_CUR_LE_OR_EXTENDS) || (mode == PAGE_CUR_DBG)); + page_check_dir(page); + #ifdef PAGE_CUR_ADAPT if ((page_header_get_field(page, PAGE_LEVEL) == 0) && (mode == PAGE_CUR_LE) @@ -595,6 +598,7 @@ page_cur_parse_insert_rec( rec_t* cursor_rec; byte buf1[1024]; byte* buf; + byte* ptr2 = ptr; ulint info_bits = 0; /* remove warning */ page_cur_t cursor; @@ -697,7 +701,20 @@ page_cur_parse_insert_rec( /* Build the inserted record to buf */ - ut_a(mismatch_index < UNIV_PAGE_SIZE); + if (mismatch_index >= UNIV_PAGE_SIZE) { + printf("Is short %lu, info_bits %lu, offset %lu, o_offset %lu\n" + "mismatch index %lu, end_seg_len %lu\n" + "parsed len %lu\n", + is_short, info_bits, offset, origin_offset, + mismatch_index, end_seg_len, (ulint)(ptr - ptr2)); + + printf("Dump of 300 bytes of log:\n"); + ut_print_buf(ptr2, 300); + + buf_page_print(page); + + ut_a(0); + } ut_memcpy(buf, rec_get_start(cursor_rec), mismatch_index); ut_memcpy(buf + mismatch_index, ptr, end_seg_len); diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index 7d240bdd5b0..e087941a970 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -353,7 +353,7 @@ page_create( infimum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); - ut_ad(infimum_rec == page + PAGE_INFIMUM); + ut_a(infimum_rec == page + PAGE_INFIMUM); rec_set_n_owned(infimum_rec, 1); rec_set_heap_no(infimum_rec, 0); @@ -370,7 +370,7 @@ page_create( supremum_rec = 
rec_convert_dtuple_to_rec(heap_top, tuple); - ut_ad(supremum_rec == page + PAGE_SUPREMUM); + ut_a(supremum_rec == page + PAGE_SUPREMUM); rec_set_n_owned(supremum_rec, 1); rec_set_heap_no(supremum_rec, 1); @@ -389,6 +389,8 @@ page_create( page_header_set_ptr(page, PAGE_FREE, NULL); page_header_set_field(page, PAGE_GARBAGE, 0); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); + page_header_set_field(page, PAGE_N_DIRECTION, 0); page_header_set_field(page, PAGE_N_RECS, 0); page_set_max_trx_id(page, ut_dulint_zero); @@ -402,17 +404,22 @@ page_create( slot = page_dir_get_nth_slot(page, 1); page_dir_slot_set_rec(slot, supremum_rec); - /* Set next pointers in infimum and supremum */ + /* Set the next pointers in infimum and supremum */ rec_set_next_offs(infimum_rec, (ulint)(supremum_rec - page)); rec_set_next_offs(supremum_rec, 0); +#ifdef notdefined + /* Disable the use of page_template: there is a race condition here: + while one thread is creating page_template, another one can start + using it before the memcpy completes! 
*/ + if (page_template == NULL) { page_template = mem_alloc(UNIV_PAGE_SIZE); ut_memcpy(page_template, page, UNIV_PAGE_SIZE); } - +#endif return(page); } @@ -439,6 +446,8 @@ page_copy_rec_list_end_no_locks( page_cur_move_to_next(&cur1); } + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == PAGE_INFIMUM); + page_cur_set_before_first(new_page, &cur2); /* Copy records from the original page to the new page */ @@ -446,8 +455,22 @@ page_copy_rec_list_end_no_locks( sup = page_get_supremum_rec(page); while (sup != page_cur_get_rec(&cur1)) { - ut_a( - page_cur_rec_insert(&cur2, page_cur_get_rec(&cur1), mtr)); + if (!page_cur_rec_insert(&cur2, + page_cur_get_rec(&cur1), mtr)) { + /* Track an assertion failure reported on the mailing + list on June 18th, 2003 */ + + buf_page_print(new_page); + buf_page_print(page); + ut_print_timestamp(stderr); + + fprintf(stderr, +"InnoDB: rec offset %lu, cur1 offset %lu, cur2 offset %lu\n", + (ulint)(rec - page), + (ulint)(page_cur_get_rec(&cur1) - page), + (ulint)(page_cur_get_rec(&cur2) - new_page)); + ut_a(0); + } page_cur_move_to_next(&cur1); page_cur_move_to_next(&cur2); @@ -1315,6 +1338,37 @@ page_rec_validate( return(TRUE); } + +/******************************************************************* +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. 
*/ + +void +page_check_dir( +/*===========*/ + page_t* page) /* in: index page */ +{ + ulint n_slots; + + n_slots = page_dir_get_n_slots(page); + + if (page_dir_slot_get_rec(page_dir_get_nth_slot(page, 0)) + != page_get_infimum_rec(page)) { + + fprintf(stderr, +"InnoDB: Page directory corruption: supremum not pointed to\n"); + buf_page_print(page); + } + + if (page_dir_slot_get_rec(page_dir_get_nth_slot(page, n_slots - 1)) + != page_get_supremum_rec(page)) { + + fprintf(stderr, +"InnoDB: Page directory corruption: supremum not pointed to\n"); + buf_page_print(page); + } +} /******************************************************************* This function checks the consistency of an index page when we do not @@ -1598,7 +1652,8 @@ page_validate( "InnoDB: previous record %s\n", err_buf); rec_sprintf(err_buf, 900, rec); - fprintf(stderr, "InnoDB: record %s\n", err_buf); + fprintf(stderr, + "InnoDB: record %s\n", err_buf); goto func_exit; } diff --git a/innobase/pars/pars0opt.c b/innobase/pars/pars0opt.c index 91083e6fa16..4faf83b47a3 100644 --- a/innobase/pars/pars0opt.c +++ b/innobase/pars/pars0opt.c @@ -1058,7 +1058,6 @@ opt_clust_access( dfield_t* dfield; mem_heap_t* heap; ulint n_fields; - ulint col_no; ulint pos; ulint i; @@ -1093,8 +1092,7 @@ opt_clust_access( plan->clust_map = mem_heap_alloc(heap, n_fields * sizeof(ulint)); for (i = 0; i < n_fields; i++) { - col_no = dict_index_get_nth_col_no(clust_index, i); - pos = dict_index_get_nth_col_pos(index, col_no); + pos = dict_index_get_nth_field_pos(index, clust_index, i); *(plan->clust_map + i) = pos; @@ -1109,7 +1107,8 @@ opt_clust_access( dfield = dtuple_get_nth_field(plan->clust_ref, table->mix_len); - dfield_set_data(dfield, mem_heap_alloc(heap, table->mix_id_len), + dfield_set_data(dfield, mem_heap_alloc(heap, + table->mix_id_len), table->mix_id_len); ut_memcpy(dfield_get_data(dfield), table->mix_id_buf, table->mix_id_len); diff --git a/innobase/pars/pars0pars.c b/innobase/pars/pars0pars.c index 
664f498ef3e..3e43b6ae262 100644 --- a/innobase/pars/pars0pars.c +++ b/innobase/pars/pars0pars.c @@ -244,13 +244,11 @@ pars_resolve_func_data_type( /* Inherit the data type from the first argument (which must not be the SQL null literal whose type is DATA_ERROR) */ - ut_a(dtype_get_mtype(que_node_get_data_type(arg)) - != DATA_ERROR); dtype_copy(que_node_get_data_type(node), que_node_get_data_type(arg)); - ut_a(dtype_get_mtype(que_node_get_data_type(node)) == DATA_INT); - + ut_a(dtype_get_mtype(que_node_get_data_type(node)) + == DATA_INT); } else if (func == PARS_COUNT_TOKEN) { ut_a(arg); dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4, 0); @@ -1596,7 +1594,7 @@ pars_create_index( column = column_list; while (column) { - dict_mem_index_add_field(index, column->name, 0); + dict_mem_index_add_field(index, column->name, 0, 0); column->resolved = TRUE; column->token_type = SYM_COLUMN; diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index e9740d7ea78..2e18e68ec43 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -38,7 +38,7 @@ Used in debug checking of cmp_dtuple_... . This function is used to compare a data tuple to a physical record. If dtuple has n fields then rec must have either m >= n fields, or it must differ from dtuple in some of the m fields rec has. 
*/ -static + int cmp_debug_dtuple_rec_with_match( /*============================*/ @@ -50,9 +50,10 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ - ulint* matched_fields);/* in/out: number of already completely - matched fields; when function returns, - contains the value for current comparison */ + ulint* matched_fields);/* in/out: number of already + completely matched fields; when function + returns, contains the value for current + comparison */ /***************************************************************** This function is used to compare two data fields for which the data type is such that we must use MySQL code to compare them. The prototype here @@ -79,17 +80,12 @@ UNIV_INLINE ulint cmp_collate( /*========*/ - /* out: collation order position */ - dtype_t* type __attribute__((unused)) , /* in: type */ - ulint code) /* in: code of a character stored in database - record */ -{ - ut_ad((type->mtype == DATA_CHAR) || (type->mtype == DATA_VARCHAR)); - + /* out: collation order position */ + ulint code) /* in: code of a character stored in database record */ +{ return((ulint) srv_latin1_ordering[code]); } - /***************************************************************** Returns TRUE if two types are equal for comparison purposes. */ @@ -118,7 +114,8 @@ cmp_types_are_equal( if (type1->mtype == DATA_INT && (type1->prtype & DATA_UNSIGNED) - != (type2->prtype & DATA_UNSIGNED)) { + != (type2->prtype & DATA_UNSIGNED)) { + /* The storage format of an unsigned integer is different from a signed integer: in a signed integer we OR 0x8000... to the value of positive integers. 
*/ @@ -131,12 +128,17 @@ cmp_types_are_equal( return(FALSE); } + if (type1->mtype == DATA_BLOB && (type1->prtype & DATA_BINARY_TYPE) + != (type2->prtype & DATA_BINARY_TYPE)) { + return(FALSE); + } + return(TRUE); } /***************************************************************** -Innobase uses this function is to compare two data fields for which the -data type is such that we must compare whole fields. */ +Innobase uses this function to compare two data fields for which the data type +is such that we must compare whole fields or call MySQL to do the comparison */ static int cmp_whole_field( @@ -239,8 +241,34 @@ cmp_whole_field( return(0); case DATA_VARMYSQL: case DATA_MYSQL: + case DATA_BLOB: + if (data_type == DATA_BLOB + && 0 != (type->prtype & DATA_BINARY_TYPE)) { + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: comparing a binary BLOB with a character set sensitive\n" +"InnoDB: comparison!\n"); + } + + /* MySQL does not pad the ends of strings with spaces in a + comparison. That would cause a foreign key check to fail for + non-latin1 character sets if we have different length columns. + To prevent that we remove trailing spaces here before doing + the comparison. NOTE that if we in the future map more MySQL + types to DATA_MYSQL or DATA_VARMYSQL, we have to change this + code. 
*/ + + while (a_length > 0 && a[a_length - 1] == ' ') { + a_length--; + } + + while (b_length > 0 && b[b_length - 1] == ' ') { + b_length--; + } + return(innobase_mysql_cmp( - (int)(type->prtype & ~DATA_NOT_NULL), + (int)(type->prtype & DATA_MYSQL_TYPE_MASK), a, a_length, b, b_length)); default: fprintf(stderr, @@ -291,7 +319,10 @@ cmp_data_data_slow( return(1); } - if (cur_type->mtype >= DATA_FLOAT) { + if (cur_type->mtype >= DATA_FLOAT + || (cur_type->mtype == DATA_BLOB + && (cur_type->prtype & DATA_NONLATIN1))) { + return(cmp_whole_field(cur_type, data1, len1, data2, len2)); } @@ -334,9 +365,12 @@ cmp_data_data_slow( goto next_byte; } - if (cur_type->mtype <= DATA_CHAR) { - data1_byte = cmp_collate(cur_type, data1_byte); - data2_byte = cmp_collate(cur_type, data2_byte); + if (cur_type->mtype <= DATA_CHAR + || (cur_type->mtype == DATA_BLOB + && 0 == (cur_type->prtype & DATA_BINARY_TYPE))) { + + data1_byte = cmp_collate(data1_byte); + data2_byte = cmp_collate(data2_byte); } if (data1_byte > data2_byte) { @@ -487,7 +521,9 @@ cmp_dtuple_rec_with_match( } } - if (cur_type->mtype >= DATA_FLOAT) { + if (cur_type->mtype >= DATA_FLOAT + || (cur_type->mtype == DATA_BLOB + && (cur_type->prtype & DATA_NONLATIN1))) { ret = cmp_whole_field(cur_type, dfield_get_data(dtuple_field), dtuple_f_len, @@ -547,10 +583,13 @@ cmp_dtuple_rec_with_match( goto next_byte; } - if (cur_type->mtype <= DATA_CHAR) { - rec_byte = cmp_collate(cur_type, rec_byte); - dtuple_byte = cmp_collate(cur_type, - dtuple_byte); + if (cur_type->mtype <= DATA_CHAR + || (cur_type->mtype == DATA_BLOB + && 0 == + (cur_type->prtype & DATA_BINARY_TYPE))) { + + rec_byte = cmp_collate(rec_byte); + dtuple_byte = cmp_collate(dtuple_byte); } if (dtuple_byte > rec_byte) { @@ -583,8 +622,8 @@ order_resolved: matched_fields)); ut_ad(*matched_fields == cur_field); /* In the debug version, the above cmp_debug_... 
sets - *matched_fields to a value */ - *matched_fields = cur_field; + *matched_fields to a value */ + *matched_fields = cur_field; *matched_bytes = cur_bytes; return(ret); @@ -804,7 +843,10 @@ cmp_rec_rec_with_match( } } - if (cur_type->mtype >= DATA_FLOAT) { + if (cur_type->mtype >= DATA_FLOAT + || (cur_type->mtype == DATA_BLOB + && (cur_type->prtype & DATA_NONLATIN1))) { + ret = cmp_whole_field(cur_type, rec1_b_ptr, rec1_f_len, rec2_b_ptr, rec2_f_len); @@ -861,9 +903,13 @@ cmp_rec_rec_with_match( goto next_byte; } - if (cur_type->mtype <= DATA_CHAR) { - rec1_byte = cmp_collate(cur_type, rec1_byte); - rec2_byte = cmp_collate(cur_type, rec2_byte); + if (cur_type->mtype <= DATA_CHAR + || (cur_type->mtype == DATA_BLOB + && 0 == + (cur_type->prtype & DATA_BINARY_TYPE))) { + + rec1_byte = cmp_collate(rec1_byte); + rec2_byte = cmp_collate(rec2_byte); } if (rec1_byte < rec2_byte) { @@ -906,7 +952,7 @@ This function is used to compare a data tuple to a physical record. If dtuple has n fields then rec must have either m >= n fields, or it must differ from dtuple in some of the m fields rec has. If encounters an externally stored field, returns 0. 
*/ -static + int cmp_debug_dtuple_rec_with_match( /*============================*/ @@ -918,9 +964,10 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ - ulint* matched_fields) /* in/out: number of already completely - matched fields; when function returns, - contains the value for current comparison */ + ulint* matched_fields) /* in/out: number of already + completely matched fields; when function + returns, contains the value for current + comparison */ { dtype_t* cur_type; /* pointer to type of the current field in dtuple */ diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index e96c08a715b..e02859bc851 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -217,8 +217,8 @@ ins_node_set_new_row( } /*********************************************************************** -Does an insert operation by updating a delete marked existing record -in the index. This situation can occur if the delete marked record is +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is kept in the index for consistent reads. */ static ulint @@ -240,9 +240,9 @@ row_ins_sec_index_entry_by_modify( ut_ad((cursor->index->type & DICT_CLUSTERED) == 0); ut_ad(rec_get_deleted_flag(rec)); - /* We know that in the ordering entry and rec are identified. - But in their binary form there may be differences if there - are char fields in them. Therefore we have to calculate the + /* We know that in the alphabetical ordering, entry and rec are + identical. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the difference and do an update-in-place if necessary. 
*/ heap = mem_heap_create(1024); @@ -305,8 +305,8 @@ row_ins_clust_index_entry_by_modify( /* Try optimistic updating of the record, keeping changes within the page */ - err = btr_cur_optimistic_update(0, cursor, update, 0, thr, mtr); - + err = btr_cur_optimistic_update(0, cursor, update, 0, thr, + mtr); if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { err = DB_FAIL; } @@ -364,11 +364,17 @@ row_ins_cascade_calc_update_vec( /* out: number of fields in the calculated update vector; the value can also be 0 if no foreign key - fields changed */ + fields changed; the returned value + is ULINT_UNDEFINED if the column + type in the child table is too short + to fit the new value in the parent + table: that means the update fails */ upd_node_t* node, /* in: update node of the parent table */ - dict_foreign_t* foreign) /* in: foreign key constraint whose + dict_foreign_t* foreign, /* in: foreign key constraint whose type is != 0 */ + mem_heap_t* heap) /* in: memory heap to use as + temporary storage */ { upd_node_t* cascade = node->cascade_node; dict_table_t* table = foreign->foreign_table; @@ -381,14 +387,16 @@ row_ins_cascade_calc_update_vec( upd_field_t* parent_ufield; ulint n_fields_updated; ulint parent_field_no; + dtype_t* type; ulint i; ulint j; ut_a(node && foreign && cascade && table && index); /* Calculate the appropriate update vector which will set the fields - in the child index record to the same value as the referenced index - record will get in the update. */ + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. 
*/ parent_table = node->table; ut_a(parent_table == foreign->referenced_table); @@ -424,7 +432,56 @@ row_ins_cascade_calc_update_vec( dict_table_get_nth_col_pos(table, dict_index_get_nth_col_no(index, i)); ufield->exp = NULL; + ufield->new_val = parent_ufield->new_val; + + type = dict_index_get_nth_type(index, i); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (ufield->new_val.len == UNIV_SQL_NULL + && (type->prtype & DATA_NOT_NULL)) { + + return(ULINT_UNDEFINED); + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (ufield->new_val.len != UNIV_SQL_NULL + && ufield->new_val.len + > dtype_get_len(type)) { + + return(ULINT_UNDEFINED); + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + if (dtype_is_fixed_size(type) + && ufield->new_val.len != UNIV_SQL_NULL + && ufield->new_val.len + < dtype_get_fixed_size(type)) { + + ufield->new_val.data = + mem_heap_alloc(heap, + dtype_get_fixed_size(type)); + ufield->new_val.len = + dtype_get_fixed_size(type); + ut_a(dtype_get_pad_char(type) + != ULINT_UNDEFINED); + + memset(ufield->new_val.data, + (byte)dtype_get_pad_char(type), + dtype_get_fixed_size(type)); + ut_memcpy(ufield->new_val.data, + parent_ufield->new_val.data, + parent_ufield->new_val.len); + } + ufield->extern_storage = FALSE; n_fields_updated++; @@ -570,9 +627,11 @@ row_ins_foreign_check_on_constraint( dict_index_t* clust_index; dtuple_t* ref; mem_heap_t* tmp_heap; + mem_heap_t* upd_vec_heap = NULL; rec_t* rec; rec_t* clust_rec; upd_t* update; + ulint n_to_update; ulint err; ulint i; char* ptr; @@ -597,8 +656,10 @@ row_ins_foreign_check_on_constraint( *ptr = '\0'; /* We call a function in ha_innodb.cc */ +#ifndef UNIV_HOTBACKUP innobase_invalidate_query_cache(thr_get_trx(thr), table_name_buf, ut_strlen(table->name) + 1); +#endif node = thr->run_node; if (node->is_delete && 0 == 
(foreign->type & @@ -730,27 +791,30 @@ row_ins_foreign_check_on_constraint( mem_heap_free(tmp_heap); clust_rec = btr_pcur_get_rec(cascade->pcur); - } - if (!page_rec_is_user_rec(clust_rec)) { - fprintf(stderr, + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + fprintf(stderr, "InnoDB: error in cascade of a foreign key op\n" "InnoDB: index %s table %s\n", index->name, index->table->name); - rec_sprintf(err_buf, 900, rec); - fprintf(stderr, "InnoDB: record %s\n", err_buf); + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: record %s\n", err_buf); - rec_sprintf(err_buf, 900, clust_rec); - fprintf(stderr, "InnoDB: clustered record %s\n", err_buf); - - fprintf(stderr, + rec_sprintf(err_buf, 900, clust_rec); + fprintf(stderr, "InnoDB: clustered record %s\n", + err_buf); + fprintf(stderr, "InnoDB: Make a detailed bug report and send it\n"); - fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); + fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); - err = DB_SUCCESS; + err = DB_SUCCESS; - goto nonstandard_exit_func; + goto nonstandard_exit_func; + } } /* Set an X-lock on the row to delete or update in the child table */ @@ -828,7 +892,21 @@ row_ins_foreign_check_on_constraint( /* Build the appropriate update vector which sets changing foreign->n_fields first fields in rec to new values */ - row_ins_cascade_calc_update_vec(node, foreign); + upd_vec_heap = mem_heap_create(256); + + n_to_update = row_ins_cascade_calc_update_vec(node, foreign, + upd_vec_heap); + if (n_to_update == ULINT_UNDEFINED) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( +(char*)"Trying a cascaded update where the updated value in the child\n" +"table would not fit in the length of the column, or the value would\n" +"be NULL and the column is declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } if 
(cascade->update->n_fields == 0) { @@ -867,10 +945,18 @@ row_ins_foreign_check_on_constraint( btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + return(err); nonstandard_exit_func: + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + btr_pcur_store_position(pcur, mtr); mtr_commit(mtr); @@ -1275,6 +1361,11 @@ row_ins_unique_report_err( dtuple_t* entry, /* in: index entry to insert in the index */ dict_index_t* index) /* in: index */ { + UT_NOT_USED(thr); + UT_NOT_USED(rec); + UT_NOT_USED(entry); + UT_NOT_USED(index); + #ifdef notdefined /* Disable reporting to test if the slowdown of REPLACE in 4.0.13 was caused by this! */ @@ -1816,7 +1907,7 @@ row_ins_index_entry( /* Try first optimistic descent to the B-tree */ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, - ext_vec, n_ext_vec, thr); + ext_vec, n_ext_vec, thr); if (err != DB_FAIL) { return(err); @@ -1832,13 +1923,15 @@ row_ins_index_entry( /*************************************************************** Sets the values of the dtuple fields in entry from the values of appropriate columns in row. 
*/ -UNIV_INLINE +static void row_ins_index_entry_set_vals( /*=========================*/ + dict_index_t* index, /* in: index */ dtuple_t* entry, /* in: index entry to make */ dtuple_t* row) /* in: row */ { + dict_field_t* ind_field; dfield_t* field; dfield_t* row_field; ulint n_fields; @@ -1850,11 +1943,21 @@ row_ins_index_entry_set_vals( for (i = 0; i < n_fields; i++) { field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + + row_field = dtuple_get_nth_field(row, ind_field->col->ind); - row_field = dtuple_get_nth_field(row, field->col_no); + /* Check column prefix indexes */ + if (ind_field->prefix_len > 0 + && dfield_get_len(row_field) != UNIV_SQL_NULL + && dfield_get_len(row_field) > ind_field->prefix_len) { + + field->len = ind_field->prefix_len; + } else { + field->len = row_field->len; + } field->data = row_field->data; - field->len = row_field->len; } } @@ -1873,7 +1976,7 @@ row_ins_index_entry_step( ut_ad(dtuple_check_typed(node->row)); - row_ins_index_entry_set_vals(node->entry, node->row); + row_ins_index_entry_set_vals(node->index, node->entry, node->row); ut_ad(dtuple_check_typed(node->entry)); diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 6d1f6f6e40e..35305b037c6 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -89,9 +89,6 @@ row_mysql_store_blob_ref( also to set the NULL bit in the MySQL record header! 
*/ { - ulint sum = 0; - ulint i; - /* MySQL might assume the field is set to zero except the length and the pointer fields */ @@ -106,22 +103,6 @@ row_mysql_store_blob_ref( ut_a(col_len - 8 > 2 || len < 256 * 256); ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); - /* We try to track an elusive bug which probably was fixed - May 9, 2002, but better be sure: we probe the data buffer - to make sure it is in valid allocated memory */ - - for (i = 0; i < len; i++) { - - sum += (ulint)(data + i); - } - - /* The variable below is identically false, we just fool the - compiler to not optimize away our loop */ - if (row_mysql_identically_false) { - - printf("Sum %lu\n", sum); - } - mach_write_to_n_little_endian(dest, col_len - 8, len); ut_memcpy(dest + col_len - 8, (byte*)&data, sizeof(byte*)); @@ -539,6 +520,7 @@ row_get_prebuilt_insert_row( ins_node_t* node; dtuple_t* row; dict_table_t* table = prebuilt->table; + ulint i; ut_ad(prebuilt && table && prebuilt->trx); @@ -562,6 +544,14 @@ row_get_prebuilt_insert_row( dict_table_copy_types(row, table); + /* We init the value of every field to the SQL NULL to avoid + a debug assertion from failing */ + + for (i = 0; i < dtuple_get_n_fields(row); i++) { + + dtuple_get_nth_field(row, i)->len = UNIV_SQL_NULL; + } + ins_node_set_new_row(node, row); prebuilt->ins_graph = @@ -965,7 +955,8 @@ row_update_for_mysql( if (prebuilt->pcur->btr_cur.index == clust_index) { btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur); } else { - btr_pcur_copy_stored_position(node->pcur, prebuilt->clust_pcur); + btr_pcur_copy_stored_position(node->pcur, + prebuilt->clust_pcur); } ut_a(node->pcur->rel_pos == BTR_PCUR_ON); @@ -1490,8 +1481,7 @@ row_create_index_for_mysql( ulint namelen; ulint keywordlen; ulint err; - ulint i; - ulint j; + ulint i, j; ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1499,23 +1489,9 @@ row_create_index_for_mysql( trx->op_info = (char *) "creating index"; - 
trx_start_if_not_started(trx); - - namelen = ut_strlen(index->table_name); - - keywordlen = ut_strlen("_recover_innodb_tmp_table"); - - if (namelen >= keywordlen - && 0 == ut_memcmp( - index->table_name + namelen - keywordlen, - (char*)"_recover_innodb_tmp_table", keywordlen)) { - - return(DB_SUCCESS); - } - /* Check that the same column does not appear twice in the index. - InnoDB assumes this in its algorithms, e.g., update of an index - entry */ + Starting from 4.0.14 InnoDB should be able to cope with that, but + safer not to allow them. */ for (i = 0; i < dict_index_get_n_fields(index); i++) { for (j = 0; j < i; j++) { @@ -1538,6 +1514,20 @@ row_create_index_for_mysql( } } + trx_start_if_not_started(trx); + + namelen = ut_strlen(index->table_name); + + keywordlen = ut_strlen("_recover_innodb_tmp_table"); + + if (namelen >= keywordlen + && 0 == ut_memcmp( + index->table_name + namelen - keywordlen, + (char*)"_recover_innodb_tmp_table", keywordlen)) { + + return(DB_SUCCESS); + } + heap = mem_heap_create(512); trx->dict_operation = TRUE; @@ -1555,6 +1545,7 @@ row_create_index_for_mysql( que_graph_free((que_t*) que_node_get_parent(thr)); error_handling: + if (err != DB_SUCCESS) { /* We have special error handling here */ @@ -1677,7 +1668,7 @@ row_drop_table_for_mysql_in_background( the InnoDB data dictionary get out-of-sync if the user runs with innodb_flush_log_at_trx_commit = 0 */ - log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); + log_buffer_flush_to_disk(); trx_commit_for_mysql(trx); @@ -1959,7 +1950,8 @@ row_drop_table_for_mysql( " found := 0;\n" " ELSE" " DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n" - " DELETE FROM SYS_INDEXES WHERE ID = index_id;\n" + " DELETE FROM SYS_INDEXES WHERE ID = index_id\n" + " AND TABLE_ID = table_id;\n" " END IF;\n" "END LOOP;\n" "DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n" @@ -2554,7 +2546,7 @@ loop: prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); - ret = 
row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); goto loop; } diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c index 40a775143f4..6c0c6c04cd5 100644 --- a/innobase/row/row0row.c +++ b/innobase/row/row0row.c @@ -136,7 +136,14 @@ row_build_index_entry( dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); dfield_copy(dfield, dfield2); - dfield->col_no = dict_col_get_no(col); + + /* If a column prefix index, take only the prefix */ + if (ind_field->prefix_len > 0 + && dfield_get_len(dfield2) != UNIV_SQL_NULL + && dfield_get_len(dfield2) > ind_field->prefix_len) { + + dfield_set_len(dfield, ind_field->prefix_len); + } } ut_ad(dtuple_check_typed(entry)); @@ -146,8 +153,7 @@ row_build_index_entry( /*********************************************************************** An inverse function to dict_row_build_index_entry. Builds a row from a -record in a clustered index. NOTE that externally stored (often big) -fields are always copied to heap. */ +record in a clustered index. 
*/ dtuple_t* row_build( @@ -172,6 +178,7 @@ row_build( { dtuple_t* row; dict_table_t* table; + dict_field_t* ind_field; dict_col_t* col; dfield_t* dfield; ulint n_fields; @@ -204,19 +211,24 @@ row_build( dict_table_copy_types(row, table); for (i = 0; i < n_fields; i++) { + ind_field = dict_index_get_nth_field(index, i); - col = dict_field_get_col(dict_index_get_nth_field(index, i)); - dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - field = rec_get_nth_field(rec, i, &len); + if (ind_field->prefix_len == 0) { - if (type == ROW_COPY_ALSO_EXTERNALS - && rec_get_nth_field_extern_bit(rec, i)) { + col = dict_field_get_col(ind_field); + dfield = dtuple_get_nth_field(row, + dict_col_get_no(col)); + field = rec_get_nth_field(rec, i, &len); - field = btr_rec_copy_externally_stored_field(rec, - i, &len, heap); - } + if (type == ROW_COPY_ALSO_EXTERNALS + && rec_get_nth_field_extern_bit(rec, i)) { - dfield_set_data(dfield, field, len); + field = btr_rec_copy_externally_stored_field( + rec, i, &len, heap); + } + + dfield_set_data(dfield, field, len); + } } ut_ad(dtuple_check_typed(row)); @@ -371,7 +383,6 @@ row_build_row_ref( dict_table_t* table; dict_index_t* clust_index; dfield_t* dfield; - dict_col_t* col; dtuple_t* ref; byte* field; ulint len; @@ -403,24 +414,13 @@ row_build_row_ref( for (i = 0; i < ref_len; i++) { dfield = dtuple_get_nth_field(ref, i); - col = dict_field_get_col( - dict_index_get_nth_field(clust_index, i)); - pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col)); + pos = dict_index_get_nth_field_pos(index, clust_index, i); - if (pos != ULINT_UNDEFINED) { - field = rec_get_nth_field(rec, pos, &len); + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, pos, &len); - dfield_set_data(dfield, field, len); - } else { - ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER); - ut_ad(i == table->mix_len); - - dfield_set_data(dfield, - mem_heap_alloc(heap, table->mix_id_len), - table->mix_id_len); - ut_memcpy(dfield_get_data(dfield), 
table->mix_id_buf, - table->mix_id_len); - } + dfield_set_data(dfield, field, len); } ut_ad(dtuple_check_typed(ref)); @@ -448,7 +448,6 @@ row_build_row_ref_in_tuple( dict_table_t* table; dict_index_t* clust_index; dfield_t* dfield; - dict_col_t* col; byte* field; ulint len; ulint ref_len; @@ -483,19 +482,13 @@ row_build_row_ref_in_tuple( for (i = 0; i < ref_len; i++) { dfield = dtuple_get_nth_field(ref, i); - col = dict_field_get_col( - dict_index_get_nth_field(clust_index, i)); - pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col)); + pos = dict_index_get_nth_field_pos(index, clust_index, i); - if (pos != ULINT_UNDEFINED) { - field = rec_get_nth_field(rec, pos, &len); + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, pos, &len); - dfield_set_data(dfield, field, len); - } else { - ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER); - ut_ad(i == table->mix_len); - ut_a(0); - } + dfield_set_data(dfield, field, len); } ut_ad(dtuple_check_typed(ref)); @@ -517,6 +510,7 @@ row_build_row_ref_from_row( directly into data of this row */ { dict_index_t* clust_index; + dict_field_t* field; dfield_t* dfield; dfield_t* dfield2; dict_col_t* col; @@ -533,13 +527,21 @@ row_build_row_ref_from_row( for (i = 0; i < ref_len; i++) { dfield = dtuple_get_nth_field(ref, i); - - col = dict_field_get_col( - dict_index_get_nth_field(clust_index, i)); - + + field = dict_index_get_nth_field(clust_index, i); + + col = dict_field_get_col(field); + dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); dfield_copy(dfield, dfield2); + + if (field->prefix_len > 0 + && dfield->len != UNIV_SQL_NULL + && dfield->len > field->prefix_len) { + + dfield->len = field->prefix_len; + } } ut_ad(dtuple_check_typed(ref)); diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index 97a69f76eaa..81bbf5053c0 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -65,41 +65,50 @@ row_sel_sec_rec_is_for_clust_rec( rec_t* sec_rec, /* in: secondary index record */ 
dict_index_t* sec_index, /* in: secondary index */ rec_t* clust_rec, /* in: clustered index record */ - dict_index_t* clust_index __attribute__((unused))) - /* in: clustered index */ + dict_index_t* clust_index) /* in: clustered index */ { - dict_col_t* col; - byte* sec_field; - ulint sec_len; - byte* clust_field; - ulint clust_len; - ulint n; - ulint i; + dict_field_t* ifield; + dict_col_t* col; + byte* sec_field; + ulint sec_len; + byte* clust_field; + ulint clust_len; + ulint n; + ulint i; - n = dict_index_get_n_ordering_defined_by_user(sec_index); + UT_NOT_USED(clust_index); - for (i = 0; i < n; i++) { - col = dict_field_get_col( - dict_index_get_nth_field(sec_index, i)); + n = dict_index_get_n_ordering_defined_by_user(sec_index); - clust_field = rec_get_nth_field(clust_rec, - dict_col_get_clust_pos(col), - &clust_len); - sec_field = rec_get_nth_field(sec_rec, i, &sec_len); + for (i = 0; i < n; i++) { + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + + clust_field = rec_get_nth_field(clust_rec, + dict_col_get_clust_pos(col), + &clust_len); + sec_field = rec_get_nth_field(sec_rec, i, &sec_len); - if (sec_len != clust_len) { + if (ifield->prefix_len > 0 + && clust_len != UNIV_SQL_NULL + && clust_len > ifield->prefix_len) { - return(FALSE); + clust_len = ifield->prefix_len; } - if (0 != cmp_data_data(dict_col_get_type(col), - clust_field, clust_len, - sec_field, sec_len)) { - return(FALSE); - } - } + if (sec_len != clust_len) { - return(TRUE); + return(FALSE); + } + + if (0 != cmp_data_data(dict_col_get_type(col), + clust_field, clust_len, + sec_field, sec_len)) { + return(FALSE); + } + } + + return(TRUE); } /************************************************************************* @@ -600,13 +609,35 @@ row_sel_get_clust_rec( clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); - ut_ad(page_rec_is_user_rec(clust_rec)); + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the 
search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + ut_a(rec_get_deleted_flag(rec)); + ut_a(node->read_view); + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + clust_rec = NULL; + + goto func_exit; + } if (!node->read_view) { /* Try to place a lock on the index record */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, index, - node->row_lock_mode, LOCK_ORDINARY, thr); + node->row_lock_mode, LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { return(err); @@ -656,13 +687,14 @@ row_sel_get_clust_rec( *out_rec = clust_rec; return(DB_SUCCESS); - } + } } /* Fetch the columns needed in test conditions */ row_sel_fetch_columns(index, clust_rec, UT_LIST_GET_FIRST(plan->columns)); +func_exit: *out_rec = clust_rec; return(DB_SUCCESS); @@ -1244,6 +1276,8 @@ rec_loop: /* PHASE 3: Get previous version in a consistent read */ + cons_read_requires_clust_rec = FALSE; + if (consistent_read) { /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ @@ -1850,9 +1884,11 @@ row_printf_step( } /******************************************************************** -Converts a key value stored in MySQL format to an Innobase dtuple. -The last field of the key value may be just a prefix of a fixed length -field: hence the parameter key_len. */ +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. 
But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. */ void row_sel_convert_mysql_key_to_innobase( @@ -1863,17 +1899,24 @@ row_sel_convert_mysql_key_to_innobase( to index! */ byte* buf, /* in: buffer to use in field conversions */ + ulint buf_len, /* in: buffer length */ dict_index_t* index, /* in: index of the key value */ byte* key_ptr, /* in: MySQL key value */ ulint key_len) /* in: MySQL key value length */ { + byte* original_buf = buf; + dict_field_t* field; dfield_t* dfield; - ulint offset; - ulint len; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; byte* key_end; ulint n_fields = 0; + ulint type; - UT_NOT_USED(index); + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ key_end = key_ptr + key_len; @@ -1882,11 +1925,14 @@ row_sel_convert_mysql_key_to_innobase( dtuple_set_n_fields(tuple, ULINT_MAX); dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); if (dfield_get_type(dfield)->mtype == DATA_SYS) { - /* A special case: we are looking for a position in a - generated clustered index: the first and the only - ordering column is ROW_ID */ + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. 
*/ ut_a(key_len == DATA_ROW_ID_LEN); @@ -1897,70 +1943,114 @@ row_sel_convert_mysql_key_to_innobase( return; } - while (key_ptr < key_end) { - offset = 0; - len = dfield_get_type(dfield)->len; + while (key_ptr < key_end) { - n_fields++; + ut_a(dict_col_get_type(field->col)->mtype + == dfield_get_type(dfield)->mtype); + + data_offset = 0; + is_null = FALSE; if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { /* The first byte in the field tells if this is an SQL NULL value */ - offset = 1; + data_offset = 1; - if (*key_ptr != 0) { + if (*key_ptr != 0) { dfield_set_data(dfield, NULL, UNIV_SQL_NULL); - goto next_part; + is_null = TRUE; } } - row_mysql_store_col_in_innobase_format( - dfield, buf, key_ptr + offset, len, - dfield_get_type(dfield)->mtype, + type = dfield_get_type(dfield)->mtype; + + /* Calculate data length and data field total length */ + + if (type == DATA_BLOB) { + /* The key field is a column prefix of a BLOB or + TEXT type column */ + + ut_a(field->prefix_len > 0); + + /* MySQL stores the actual data length to the first 2 + bytes after the optional SQL NULL marker byte. The + storage format is little-endian. 
*/ + + /* There are no key fields > 255 bytes currently in + MySQL */ + if (key_ptr[data_offset + 1] != 0) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: BLOB or TEXT prefix > 255 bytes in query to table %s\n", + index->table_name); + } + + data_len = key_ptr[data_offset]; + data_field_len = data_offset + 2 + field->prefix_len; + data_offset += 2; + + type = DATA_CHAR; /* now that we know the length, we + store the column value like it would + be a fixed char field */ + } else if (field->prefix_len > 0) { + data_len = field->prefix_len; + data_field_len = data_offset + data_len; + } else { + data_len = dfield_get_type(dfield)->len; + data_field_len = data_offset + data_len; + } + + /* Storing may use at most data_len bytes of buf */ + + if (!is_null) { + row_mysql_store_col_in_innobase_format( + dfield, buf, key_ptr + data_offset, + data_len, type, dfield_get_type(dfield)->prtype & DATA_UNSIGNED); - next_part: - key_ptr += (offset + len); + buf += data_len; + } + + key_ptr += data_field_len; if (key_ptr > key_end) { - /* The last field in key was not a complete - field but a prefix of it. + /* The last field in key was not a complete key field + but a prefix of it. - Print a warning about this! HA_READ_PREFIX_LAST - does not currently work in InnoDB with partial-field - key value prefixes. Since MySQL currently uses a - padding trick to calculate LIKE 'abc%' type queries - there should never be partial-field prefixes - in searches. */ + Print a warning about this! HA_READ_PREFIX_LAST does + not currently work in InnoDB with partial-field key + value prefixes. Since MySQL currently uses a padding + trick to calculate LIKE 'abc%' type queries there + should never be partial-field prefixes in searches. 
*/ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: using a partial-field key prefix in search\n"); - ut_ad(dfield_get_len(dfield) != UNIV_SQL_NULL); - - dfield_set_data(dfield, buf, - len - (ulint)(key_ptr - key_end)); + if (!is_null) { + dfield->len -= (ulint)(key_ptr - key_end); + } } - buf += len; - + n_fields++; + field++; dfield++; } - /* We set the length of tuple to n_fields: we assume that - the memory area allocated for it is big enough (usually - bigger than n_fields). */ + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ dtuple_set_n_fields(tuple, n_fields); } /****************************************************************** Stores the row id to the prebuilt struct. */ -UNIV_INLINE +static void row_sel_store_row_id_to_prebuilt( /*=============================*/ @@ -1970,11 +2060,22 @@ row_sel_store_row_id_to_prebuilt( { byte* data; ulint len; + char err_buf[1000]; data = rec_get_nth_field(index_rec, dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); - ut_a(len == DATA_ROW_ID_LEN); + if (len != DATA_ROW_ID_LEN) { + rec_sprintf(err_buf, 900, index_rec); + + fprintf(stderr, +"InnoDB: Error: Row id field is wrong length %lu in table %s index %s\n" +"InnoDB: Field number %lu, record:\n%s\n", + len, index->table_name, index->name, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), + err_buf); + ut_a(0); + } ut_memcpy(prebuilt->row_id, data, len); } @@ -2210,7 +2311,10 @@ row_sel_get_clust_rec_for_mysql( /* out: DB_SUCCESS or error code */ row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */ dict_index_t* sec_index,/* in: secondary index where rec resides */ - rec_t* rec, /* in: record in a non-clustered index */ + rec_t* rec, /* in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ que_thr_t* thr, 
/* in: query thread */ rec_t** out_rec,/* out: clustered record or an old version of it, NULL if the old version did not exist @@ -2226,7 +2330,7 @@ row_sel_get_clust_rec_for_mysql( ulint err; trx_t* trx; char err_buf[1000]; - + *out_rec = NULL; row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec); @@ -2239,26 +2343,47 @@ row_sel_get_clust_rec_for_mysql( clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); - if (!page_rec_is_user_rec(clust_rec)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: error clustered record for sec rec not found\n" - "InnoDB: index %s table %s\n", sec_index->name, - sec_index->table->name); + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ - rec_sprintf(err_buf, 900, rec); - fprintf(stderr, "InnoDB: sec index record %s\n", err_buf); + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. 
*/ - rec_sprintf(err_buf, 900, clust_rec); - fprintf(stderr, "InnoDB: clust index record %s\n", err_buf); + if (!rec_get_deleted_flag(rec) + || prebuilt->select_lock_type != LOCK_NONE) { - trx = thr_get_trx(thr); - trx_print(err_buf, trx); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: error clustered record for sec rec not found\n" + "InnoDB: index %s table %s\n", sec_index->name, + sec_index->table->name); - fprintf(stderr, - "%s\nInnoDB: Make a detailed bug report and send it\n", - err_buf); - fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, + "InnoDB: sec index record %s\n", err_buf); + + rec_sprintf(err_buf, 900, clust_rec); + fprintf(stderr, + "InnoDB: clust index record %s\n", err_buf); + + trx = thr_get_trx(thr); + trx_print(err_buf, trx); + + fprintf(stderr, + "%s\nInnoDB: Make a detailed bug report and send it\n", + err_buf); + fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); + } clust_rec = NULL; @@ -2936,8 +3061,6 @@ rec_loop: /*-------------------------------------------------------------*/ /* PHASE 4: Look for matching records in a loop */ - cons_read_requires_clust_rec = FALSE; - rec = btr_pcur_get_rec(pcur); /* printf("Using index %s cnt %lu ", index->name, cnt); @@ -3044,7 +3167,7 @@ rec_loop: if (prebuilt->select_lock_type != LOCK_NONE && set_also_gap_locks) { - /* Try to place a lock on the index record */ + /* Try to place a lock on the index record */ err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type, @@ -3092,6 +3215,8 @@ rec_loop: /* We are ready to look at a possible new index entry in the result set: the cursor is now placed on a user record */ + cons_read_requires_clust_rec = FALSE; + if (prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a lock on the index record; note that delete marked records are a special case in a unique search. 
If there @@ -3117,8 +3242,6 @@ rec_loop: /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - cons_read_requires_clust_rec = FALSE; - if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { /* Do nothing: we let a non-locking SELECT read the @@ -3162,7 +3285,7 @@ rec_loop: if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { - /* The record is delete marked: we can skip it if this is + /* The record is delete-marked: we can skip it if this is not a consistent read which might see an earlier version of a non-clustered index record */ @@ -3275,7 +3398,7 @@ got_row: goto normal_return; next_rec: - /*-------------------------------------------------------------*/ + /*-------------------------------------------------------------*/ /* PHASE 5: Move the cursor to the next index record */ if (mtr_has_extra_clust_latch) { diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index b84e55ca643..b22e494f891 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -428,7 +428,8 @@ row_undo_mod_del_unmark_sec( found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, &mtr); if (!found) { - fprintf(stderr, "InnoDB: error in sec index entry del undo in\n" + fprintf(stderr, + "InnoDB: error in sec index entry del undo in\n" "InnoDB: index %s table %s\n", index->name, index->table->name); dtuple_sprintf(err_buf, 900, entry); @@ -570,7 +571,7 @@ row_undo_mod_upd_exist_sec( the row */ row_upd_index_replace_new_col_vals(entry, index, - node->update); + node->update, NULL); row_undo_mod_del_unmark_sec(node, thr, index, entry); } diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 5fce1c1861b..db68479509d 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -72,8 +72,9 @@ searched delete is obviously to keep the x-latch for several steps of query graph execution. 
*/ /*************************************************************** -Checks if an update vector changes some of the first fields of an index -record. */ +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. */ static ibool row_upd_changes_first_fields( @@ -234,7 +235,8 @@ row_upd_check_references_constraints( if (err != DB_SUCCESS) { if (got_s_lock) { - row_mysql_unfreeze_data_dictionary(trx); + row_mysql_unfreeze_data_dictionary( + trx); } mem_heap_free(heap); @@ -350,14 +352,15 @@ row_upd_index_entry_sys_field( } /*************************************************************** -Returns TRUE if row update changes size of some field in index -or if some field to be updated is stored externally in rec or update. */ +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. */ ibool -row_upd_changes_field_size( -/*=======================*/ +row_upd_changes_field_size_or_external( +/*===================================*/ /* out: TRUE if the update changes the size of - some field in index */ + some field in index or the field is external + in rec or update */ rec_t* rec, /* in: record in clustered index */ dict_index_t* index, /* in: clustered index */ upd_t* update) /* in: update vector */ @@ -820,69 +823,55 @@ void row_upd_index_replace_new_col_vals( /*===============================*/ dtuple_t* entry, /* in/out: index entry where replaced */ - dict_index_t* index, /* in: index; NOTE that may also be a + dict_index_t* index, /* in: index; NOTE that this may also be a non-clustered index */ - upd_t* update) /* in: update vector */ + upd_t* update, /* in: update vector */ + mem_heap_t* heap) /* in: memory heap to which we allocate and + copy the new values, set this as NULL if you + do not want allocation */ { + dict_field_t* field; upd_field_t* 
upd_field; dfield_t* dfield; dfield_t* new_val; - ulint field_no; - dict_index_t* clust_index; + ulint j; ulint i; ut_ad(index); - clust_index = dict_table_get_first_index(index->table); - dtuple_set_info_bits(entry, update->info_bits); - for (i = 0; i < upd_get_n_fields(update); i++) { - - upd_field = upd_get_nth_field(update, i); - - field_no = dict_index_get_nth_col_pos(index, - dict_index_get_nth_col_no(clust_index, - upd_field->field_no)); - if (field_no != ULINT_UNDEFINED) { - dfield = dtuple_get_nth_field(entry, field_no); + for (j = 0; j < dict_index_get_n_fields(index); j++) { - new_val = &(upd_field->new_val); + field = dict_index_get_nth_field(index, j); - dfield_set_data(dfield, new_val->data, new_val->len); - } - } -} - -/*************************************************************** -Replaces the new column values stored in the update vector to the -clustered index entry given. */ - -void -row_upd_clust_index_replace_new_col_vals( -/*=====================================*/ - dtuple_t* entry, /* in/out: index entry where replaced */ - upd_t* update) /* in: update vector */ -{ - upd_field_t* upd_field; - dfield_t* dfield; - dfield_t* new_val; - ulint field_no; - ulint i; + for (i = 0; i < upd_get_n_fields(update); i++) { - dtuple_set_info_bits(entry, update->info_bits); + upd_field = upd_get_nth_field(update, i); - for (i = 0; i < upd_get_n_fields(update); i++) { + if (upd_field->field_no == field->col->clust_pos) { - upd_field = upd_get_nth_field(update, i); + dfield = dtuple_get_nth_field(entry, j); - field_no = upd_field->field_no; + new_val = &(upd_field->new_val); - dfield = dtuple_get_nth_field(entry, field_no); + dfield_set_data(dfield, new_val->data, + new_val->len); + if (heap && new_val->len != UNIV_SQL_NULL) { + dfield->data = mem_heap_alloc(heap, + new_val->len); + ut_memcpy(dfield->data, new_val->data, + new_val->len); + } - new_val = &(upd_field->new_val); + if (field->prefix_len > 0 + && new_val->len != UNIV_SQL_NULL + && new_val->len > 
field->prefix_len) { - dfield_set_data(dfield, new_val->data, new_val->len); + dfield->len = field->prefix_len; + } + } + } } } @@ -931,9 +920,15 @@ row_upd_changes_ord_field_binary( upd_field = upd_get_nth_field(update, j); + /* Note that if the index field is a column prefix + then it may be that row does not contain an externally + stored part of the column value, and we cannot compare + the datas */ + if (col_pos == upd_field->field_no - && (row == NULL - || !dfield_datas_are_binary_equal( + && (row == NULL + || ind_field->prefix_len > 0 + || !dfield_datas_are_binary_equal( dtuple_get_nth_field(row, col_no), &(upd_field->new_val)))) { return(TRUE); @@ -978,8 +973,9 @@ row_upd_changes_some_index_ord_field_binary( } /*************************************************************** -Checks if an update vector changes some of the first fields of an index -record. */ +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. 
*/ static ibool row_upd_changes_first_fields( @@ -1013,9 +1009,10 @@ row_upd_changes_first_fields( upd_field = upd_get_nth_field(update, j); if (col_pos == upd_field->field_no - && cmp_dfield_dfield( + && (ind_field->prefix_len > 0 + || 0 != cmp_dfield_dfield( dtuple_get_nth_field(entry, i), - &(upd_field->new_val))) { + &(upd_field->new_val)))) { return(TRUE); } } @@ -1204,7 +1201,7 @@ close_cur: } /* Build a new index entry */ - row_upd_index_replace_new_col_vals(entry, index, node->update); + row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); /* Insert new index entry */ err = row_ins_index_entry(index, entry, NULL, 0, thr); @@ -1317,12 +1314,12 @@ row_upd_clust_rec_by_insert( entry = row_build_index_entry(node->row, index, heap); - row_upd_clust_index_replace_new_col_vals(entry, node->update); + row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); /* If we return from a lock wait, for example, we may have - extern fields marked as not-owned in entry (marked if the + extern fields marked as not-owned in entry (marked in the if-branch above). We must unmark them. 
*/ btr_cur_unmark_dtuple_extern_fields(entry, node->ext_vec, @@ -1702,9 +1699,9 @@ function_exit: /* Do some cleanup */ if (node->row != NULL) { - mem_heap_empty(node->heap); node->row = NULL; node->n_ext_vec = 0; + mem_heap_empty(node->heap); } node->state = UPD_NODE_UPDATE_CLUSTERED; diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c index cd8b18e5e12..d4a463d8a96 100644 --- a/innobase/row/row0vers.c +++ b/innobase/row/row0vers.c @@ -27,6 +27,7 @@ Created 2/6/1997 Heikki Tuuri #include "row0upd.h" #include "rem0cmp.h" #include "read0read.h" +#include "lock0lock.h" /********************************************************************* Finds out if an active transaction has inserted or modified a secondary @@ -58,7 +59,6 @@ row_vers_impl_x_locked_off_kernel( ibool rec_del; ulint err; mtr_t mtr; - char err_buf[1000]; ut_ad(mutex_own(&kernel_mutex)); ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); @@ -76,22 +76,20 @@ row_vers_impl_x_locked_off_kernel( clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr); if (!clust_rec) { - rec_sprintf(err_buf, 900, rec); - - ut_print_timestamp(stderr); - fprintf(stderr, -" InnoDB: Error: cannot find the clustered index record\n" -"InnoDB: for a secondary index record in table %s index %s.\n" -"InnoDB: Secondary index record %s.\n" -"InnoDB: The table is probably corrupt. 
Please run CHECK TABLE on it.\n" -"InnoDB: You can try to repair the table by dump + drop + reimport.\n" -"InnoDB: Send a detailed bug report to mysql@lists.mysql.com.\n", - index->table_name, index->name, err_buf); - mutex_enter(&kernel_mutex); - mtr_commit(&mtr); - - /* We assume there is no lock on the record, though this - is not certain because the table is apparently corrupt */ + /* In a rare case it is possible that no clust rec is found + for a secondary index record: if in row0umod.c + row_undo_mod_remove_clust_low() we have already removed the + clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case there cannot be + any implicit lock on the secondary index record, because + an active transaction which has modified the secondary index + record has also modified the clustered index record. And in + a rollback we always undo the modifications to secondary index + records before the clustered index record. */ + + mutex_enter(&kernel_mutex); + mtr_commit(&mtr); return(NULL); } @@ -111,6 +109,14 @@ row_vers_impl_x_locked_off_kernel( return(NULL); } + if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, TRUE)) { + /* Corruption noticed: try to avoid a crash by returning */ + + mtr_commit(&mtr); + + return(NULL); + } + /* We look up if some earlier version of the clustered index record would require rec to be in a different state (delete marked or unmarked, or not existing). If there is such a version, then rec was @@ -177,7 +183,8 @@ row_vers_impl_x_locked_off_kernel( /* If we get here, we know that the trx_id transaction is still active and it has modified prev_version. Let us check - if prev_version would require rec to be in a different state. */ + if prev_version would require rec to be in a different + state. 
*/ vers_del = rec_get_deleted_flag(prev_version); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index 90331157289..a886cbee22a 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -298,6 +298,7 @@ ulint srv_test_n_mutexes = ULINT_MAX; i/o handler thread */ char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS]; +char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; time_t srv_last_monitor_time; @@ -1750,7 +1751,7 @@ srv_conc_enter_innodb( trx_t* trx) /* in: transaction object associated with the thread */ { - ibool has_slept = FALSE; + ibool has_slept = FALSE; srv_conc_slot_t* slot; ulint i; char err_buf[1000]; @@ -1769,9 +1770,9 @@ srv_conc_enter_innodb( return; } -retry: - os_fast_mutex_lock(&srv_conc_mutex); + os_fast_mutex_lock(&srv_conc_mutex); +retry: if (trx->declared_to_be_inside_innodb) { ut_print_timestamp(stderr); @@ -1780,6 +1781,9 @@ retry: fprintf(stderr, " InnoDB: Error: trying to declare trx to enter InnoDB, but\n" "InnoDB: it already is declared.\n%s\n", err_buf); + os_fast_mutex_unlock(&srv_conc_mutex); + + return; } if (srv_conc_n_threads < (lint)srv_thread_concurrency) { @@ -1793,21 +1797,31 @@ retry: return; } - /* If the transaction is not holding resources, let it sleep - for 100 milliseconds, and try again then */ - + /* If the transaction is not holding resources, let it sleep for 50 + milliseconds, and try again then */ + if (!has_slept && !trx->has_search_latch && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) { - has_slept = TRUE; /* We let is sleep only once to avoid - starvation */ + has_slept = TRUE; /* We let is sleep only once to avoid + starvation */ + + srv_conc_n_waiting_threads++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + trx->op_info = (char*)"sleeping before joining InnoDB queue"; + + os_thread_sleep(50000); - os_fast_mutex_unlock(&srv_conc_mutex); + trx->op_info = (char*)""; - os_thread_sleep(100000); + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_waiting_threads--; goto retry; - } + } /* Too 
many threads inside: put the current thread to a queue */ @@ -2131,7 +2145,8 @@ srv_suspend_mysql_thread( os_event_t event; double wait_time; trx_t* trx; - ibool had_dict_lock = FALSE; + ibool had_dict_lock = FALSE; + ibool was_declared_inside_innodb = FALSE; ut_ad(!mutex_own(&kernel_mutex)); @@ -2179,11 +2194,16 @@ srv_suspend_mysql_thread( mutex_exit(&kernel_mutex); - /* We must declare this OS thread to exit InnoDB, since a possible - other thread holding a lock which this thread waits for must be - allowed to enter, sooner or later */ + if (trx->declared_to_be_inside_innodb) { + + was_declared_inside_innodb = TRUE; + + /* We must declare this OS thread to exit InnoDB, since a + possible other thread holding a lock which this thread waits + for must be allowed to enter, sooner or later */ - srv_conc_force_exit_innodb(thr_get_trx(thr)); + srv_conc_force_exit_innodb(trx); + } /* Release possible foreign key check latch */ if (trx->dict_operation_lock_mode == RW_S_LATCH) { @@ -2204,9 +2224,12 @@ srv_suspend_mysql_thread( row_mysql_freeze_data_dictionary(trx); } - /* Return back inside InnoDB */ + if (was_declared_inside_innodb) { + + /* Return back inside InnoDB */ - srv_conc_force_enter_innodb(thr_get_trx(thr)); + srv_conc_force_enter_innodb(trx); + } mutex_enter(&kernel_mutex); @@ -2302,6 +2325,7 @@ srv_sprintf_innodb_monitor( char* buf_end = buf + len - 2000; double time_elapsed; time_t current_time; + ulint n_reserved; mutex_enter(&srv_innodb_monitor_mutex); @@ -2429,12 +2453,21 @@ srv_sprintf_innodb_monitor( "ROW OPERATIONS\n" "--------------\n"); buf += sprintf(buf, - "%ld queries inside InnoDB, %ld queries in queue\n", + "%ld queries inside InnoDB, %lu queries in queue\n", srv_conc_n_threads, srv_conc_n_waiting_threads); + + n_reserved = fil_space_get_n_reserved_extents(0); + if (n_reserved > 0) { + buf += sprintf(buf, + "%lu tablespace extents now reserved for B-tree split operations\n", + n_reserved); + } + #ifdef UNIV_LINUX buf += sprintf(buf, - "Main 
thread process no %lu, state: %s\n", + "Main thread process no. %lu, id %lu, state: %s\n", srv_main_thread_process_no, + srv_main_thread_id, srv_main_thread_op_info); #else buf += sprintf(buf, @@ -2498,8 +2531,8 @@ srv_lock_timeout_and_monitor_thread( ulint i; #ifdef UNIV_DEBUG_THREAD_CREATION - printf("Lock timeout thread starts\n"); - printf("Thread id %lu\n", os_thread_pf(os_thread_get_curr_id())); + printf("Lock timeout thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); #endif UT_NOT_USED(arg); srv_last_monitor_time = time(NULL); @@ -2671,8 +2704,8 @@ srv_error_monitor_thread( UT_NOT_USED(arg); #ifdef UNIV_DEBUG_THREAD_CREATION - printf("Error monitor thread starts\n"); - printf("Thread id %lu\n", os_thread_pf(os_thread_get_curr_id())); + printf("Error monitor thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); #endif loop: srv_error_monitor_active = TRUE; @@ -2794,8 +2827,8 @@ srv_master_thread( UT_NOT_USED(arg); #ifdef UNIV_DEBUG_THREAD_CREATION - printf("Master thread starts\n"); - printf("Thread id %lu\n", os_thread_pf(os_thread_get_curr_id())); + printf("Master thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); #endif srv_main_thread_process_no = os_proc_get_number(); srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); @@ -2868,7 +2901,7 @@ loop: at transaction commit */ srv_main_thread_op_info = (char*)"flushing log"; - log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); + log_buffer_flush_to_disk(); /* If there were less than 5 i/os during the one second sleep, we assume that there is free @@ -2884,10 +2917,9 @@ loop: (char*)"doing insert buffer merge"; ibuf_contract_for_n_pages(TRUE, 5); - srv_main_thread_op_info = - (char*)"flushing log"; - log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, - TRUE); + srv_main_thread_op_info = (char*)"flushing log"; + + log_buffer_flush_to_disk(); } if (buf_get_modified_ratio_pct() > @@ -2937,7 +2969,7 @@ loop: buf_flush_batch(BUF_FLUSH_LIST, 100, 
ut_dulint_max); srv_main_thread_op_info = (char*) "flushing log"; - log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); + log_buffer_flush_to_disk(); } /* We run a batch of insert buffer merge every 10 seconds, @@ -2947,7 +2979,7 @@ loop: ibuf_contract_for_n_pages(TRUE, 5); srv_main_thread_op_info = (char*)"flushing log"; - log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE); + log_buffer_flush_to_disk(); /* We run a full purge every 10 seconds, even if the server were active */ @@ -2971,8 +3003,7 @@ loop: if (difftime(current_time, last_flush_time) > 1) { srv_main_thread_op_info = (char*) "flushing log"; - log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, - TRUE); + log_buffer_flush_to_disk(); last_flush_time = current_time; } } @@ -3043,10 +3074,29 @@ background_loop: srv_main_thread_op_info = (char*)"purging"; - if (srv_fast_shutdown && srv_shutdown_state > 0) { - n_pages_purged = 0; - } else { - n_pages_purged = trx_purge(); + /* Run a full purge */ + + n_pages_purged = 1; + + last_flush_time = time(NULL); + + while (n_pages_purged) { + if (srv_fast_shutdown && srv_shutdown_state > 0) { + + break; + } + + srv_main_thread_op_info = (char*)"purging"; + n_pages_purged = trx_purge(); + + current_time = time(NULL); + + if (difftime(current_time, last_flush_time) > 1) { + srv_main_thread_op_info = (char*) "flushing log"; + + log_buffer_flush_to_disk(); + last_flush_time = current_time; + } } srv_main_thread_op_info = (char*)"reserving kernel mutex"; @@ -3092,6 +3142,10 @@ flush_loop: (char*) "waiting for buffer pool flush to end"; buf_flush_wait_batch_end(BUF_FLUSH_LIST); + srv_main_thread_op_info = (char*) "flushing log"; + + log_buffer_flush_to_disk(); + srv_main_thread_op_info = (char*)"making checkpoint"; log_checkpoint(TRUE, FALSE); diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index ad985d8282d..f0ff1167f4d 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -415,8 +415,8 @@ io_handler_thread( segment = 
*((ulint*)arg); #ifdef UNIV_DEBUG_THREAD_CREATION - printf("Io handler thread %lu starts\n", segment); - printf("Thread id %lu\n", os_thread_pf(os_thread_get_curr_id())); + printf("Io handler thread %lu starts, id %lu\n", segment, + os_thread_pf(os_thread_get_curr_id())); #endif for (i = 0;; i++) { fil_aio_wait(segment); @@ -1073,6 +1073,10 @@ innobase_start_or_create_for_mysql(void) srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; } else if (0 == ut_strcmp(srv_file_flush_method_str, + (char*)"O_DIRECT")) { + srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, (char*)"littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; @@ -1531,7 +1535,9 @@ innobase_shutdown_for_mysql(void) } /* 1. Flush buffer pool to disk, write the current lsn to - the tablespace header(s), and copy all log data to archive */ + the tablespace header(s), and copy all log data to archive. + The step 1 is the real InnoDB shutdown. The remaining steps + just free data structures after the shutdown. 
*/ logs_empty_and_mark_files_at_shutdown(); diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 32615ce88ac..773b239189c 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -159,7 +159,7 @@ struct sync_thread_struct{ }; /* Number of slots reserved for each OS thread in the sync level array */ -#define SYNC_THREAD_N_LEVELS 10000 +#define SYNC_THREAD_N_LEVELS 250 struct sync_level_struct{ void* latch; /* pointer to a mutex or an rw-lock; NULL means that diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index d58240d3c11..fa9c287b0ad 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -593,7 +593,7 @@ trx_purge_rseg_get_next_history_log( mutex_enter(&(rseg->mutex)); - ut_ad(rseg->last_page_no != FIL_NULL); + ut_a(rseg->last_page_no != FIL_NULL); purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1); purge_sys->purge_undo_no = ut_dulint_zero; @@ -606,16 +606,9 @@ trx_purge_rseg_get_next_history_log( log_hdr = undo_page + rseg->last_offset; seg_hdr = undo_page + TRX_UNDO_SEG_HDR; - if ((mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0) - && (mach_read_from_2(seg_hdr + TRX_UNDO_STATE) - == TRX_UNDO_TO_PURGE)) { - - /* This is the last log header on this page and the log - segment cannot be reused: we may increment the number of - pages handled */ + /* Increase the purge page count by one for every handled log */ - purge_sys->n_pages_handled++; - } + purge_sys->n_pages_handled++; prev_log_addr = trx_purge_get_log_from_hist( flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index 05e179e06a5..9453189d598 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -272,8 +272,8 @@ trx_undo_page_report_insert( mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, ptr - undo_page); - /* Write the log entry to the REDO log of this change in the UNDO log */ - + /* Write the log entry to the REDO log of this 
change in the UNDO + log */ trx_undof_page_add_undo_rec_log(undo_page, first_free, ptr - undo_page, mtr); return(first_free); @@ -492,7 +492,8 @@ trx_undo_page_report_modify( /* Reserve 2 bytes for the pointer to the next undo log record */ ptr += 2; - /* Store first some general parameters to the undo log */ + /* Store first some general parameters to the undo log */ + if (update) { if (rec_get_deleted_flag(rec)) { type_cmpl = TRX_UNDO_UPD_DEL_REC; @@ -526,8 +527,7 @@ trx_undo_page_report_modify( /* Store the values of the system columns */ trx_id = dict_index_rec_get_sys_col(index, DATA_TRX_ID, rec); - roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); - + roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); len = mach_dulint_write_compressed(ptr, trx_id); ptr += len; @@ -632,7 +632,11 @@ trx_undo_page_report_modify( columns which occur as ordering fields in any index. This info is used in the purge of old versions where we use it to build and search the delete marked index records, to look if we can remove them from the - index tree. */ + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. But we always store at least + 384 first bytes locally to the clustered index record, which means + we can construct the column prefix fields in the index from the + stored data. 
*/ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { @@ -1408,11 +1412,11 @@ trx_undo_prev_version_build( return(DB_ERROR); } - if (row_upd_changes_field_size(rec, index, update)) { - - entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); + if (row_upd_changes_field_size_or_external(rec, index, update)) { - row_upd_clust_index_replace_new_col_vals(entry, update); + entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, + heap); + row_upd_index_replace_new_col_vals(entry, index, update, heap); buf = mem_heap_alloc(heap, rec_get_converted_size(entry)); diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index a9f8c5ad22c..7d1b341221c 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -52,6 +52,11 @@ trx_general_rollback_for_mysql( que_thr_t* thr; roll_node_t* roll_node; + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + trx_start_if_not_started(trx); heap = mem_heap_create(512); @@ -89,6 +94,11 @@ trx_general_rollback_for_mysql( ut_a(trx->error_state == DB_SUCCESS); + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + return((int) trx->error_state); } @@ -110,20 +120,8 @@ trx_rollback_for_mysql( trx->op_info = (char *) "rollback"; - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - err = trx_general_rollback_for_mysql(trx, FALSE, NULL); - trx_mark_sql_stat_end(trx); - - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - trx->op_info = (char *) ""; return(err); @@ -147,26 +145,192 @@ trx_rollback_last_sql_stat_for_mysql( trx->op_info = (char *) "rollback of SQL statement"; - /* Tell Innobase server that there might be work for - utility threads: */ - - srv_active_wake_master_thread(); - err = trx_general_rollback_for_mysql(trx, TRUE, &(trx->last_sql_stat_start)); + /* 
The following call should not be needed, but we play safe: */ trx_mark_sql_stat_end(trx); - /* Tell Innobase server that there might be work for - utility threads: */ + trx->op_info = (char *) ""; + + return(err); +} - srv_active_wake_master_thread(); +/*********************************************************************** +Frees savepoint structs. */ - trx->op_info = (char *) ""; +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep) /* in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ +{ + trx_named_savept_t* next_savep; + + if (savep == NULL) { + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + } else { + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + while (savep != NULL) { + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + mem_free(savep->name); + mem_free(savep); + + savep = next_savep; + } +} + +/*********************************************************************** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. 
*/ + +ulint +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + char* savepoint_name, /* in: savepoint name */ + ib_longlong* mysql_binlog_cache_pos) /* out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + ulint err; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + if (trx->conc_state == TRX_NOT_STARTED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: transaction has a savepoint %s though it is not started\n", + savep->name); + return(DB_ERROR); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = (char *) "rollback to a savepoint"; + + err = trx_general_rollback_for_mysql(trx, TRUE, &(savep->savept)); + + /* Store the current undo_no of the transaction so that we know where + to roll back if we have to roll back the next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = (char *) ""; + return(err); } /*********************************************************************** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. 
*/ + +ulint +trx_savepoint_for_mysql( +/*====================*/ + /* out: always DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + char* savepoint_name, /* in: savepoint name */ + ib_longlong binlog_cache_pos) /* in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + ut_a(trx); + ut_a(savepoint_name); + + trx_start_if_not_started(trx); + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = mem_alloc(sizeof(trx_named_savept_t)); + + savep->name = mem_alloc(1 + ut_strlen(savepoint_name)); + ut_memcpy(savep->name, savepoint_name, 1 + ut_strlen(savepoint_name)); + + savep->savept = trx_savept_take(trx); + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Returns a transaction savepoint taken at this point in time. */ + +trx_savept_t +trx_savept_take( +/*============*/ + /* out: savepoint */ + trx_t* trx) /* in: transaction */ +{ + trx_savept_t savept; + + savept.least_undo_no = trx->undo_no; + + return(savept); +} + +/*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert undo log. If the transaction was not yet committed, then we roll it back. 
*/ @@ -325,22 +489,6 @@ loop: goto loop; } - -/*********************************************************************** -Returns a transaction savepoint taken at this point in time. */ - -trx_savept_t -trx_savept_take( -/*============*/ - /* out: savepoint */ - trx_t* trx) /* in: transaction */ -{ - trx_savept_t savept; - - savept.least_undo_no = trx->undo_no; - - return(savept); -} /*********************************************************************** Creates an undo number array. */ diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index b9e4a9fea4b..0c0dbab708c 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -321,8 +321,8 @@ trx_sys_doublewrite_restore_corrupt_pages(void) for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { - space_id = mach_read_from_4(page + FIL_PAGE_SPACE); page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + space_id = 0; if (!fil_check_adress_in_tablespace(space_id, page_no)) { fprintf(stderr, diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 5753b5b338e..1ece349ec6c 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -86,6 +86,10 @@ trx_create( trx->start_time = time(NULL); trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->id = ut_dulint_zero; + trx->no = ut_dulint_max; + trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; @@ -135,6 +139,8 @@ trx_create( trx->lock_heap = mem_heap_create_in_buffer(256); UT_LIST_INIT(trx->trx_locks); + UT_LIST_INIT(trx->trx_savepoints); + trx->dict_operation_lock_mode = 0; trx->has_search_latch = FALSE; trx->search_latch_timeout = BTR_SEA_TIMEOUT; @@ -776,29 +782,53 @@ trx_commit_off_kernel( efficient here: call os_thread_yield here to allow also other trxs to come to commit! */ - /* We now flush the log, as the transaction made changes to - the database, making the transaction committed on disk. It is - enough that any one of the log groups gets written to disk. 
*/ - /*-------------------------------------*/ - /* Most MySQL users run with srv_flush_.. set to 0: */ - - if (srv_flush_log_at_trx_commit != 0) { - if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && srv_flush_log_at_trx_commit != 2 - && !trx->flush_log_later) { - - /* Write the log to the log files AND flush - them to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); - } else { - /* Write the log but do not flush it to disk */ - - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); - } - } + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under MySQL's binlog mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the binlog mutex. This is to make the + group commit algorithm to work. Otherwise, the MySQL binlog + mutex would serialize all commits and prevent a group of + transactions from gathering. 
*/ + + if (trx->flush_log_later) { + /* Do nothing yet */ + } else if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_a(0); + } trx->commit_lsn = lsn; @@ -807,6 +837,9 @@ trx_commit_off_kernel( mutex_enter(&kernel_mutex); } + /* Free savepoints */ + trx_roll_savepoints_free(trx, NULL); + trx->conc_state = TRX_NOT_STARTED; trx->rseg = NULL; trx->undo_no = ut_dulint_zero; @@ -1492,21 +1525,37 @@ trx_commit_complete_for_mysql( /* out: 0 or error number */ trx_t* trx) /* in: trx handle */ { - ut_a(trx); + dulint lsn = trx->commit_lsn; + + ut_a(trx); + + trx->op_info = (char*)"flushing log"; - if (srv_flush_log_at_trx_commit == 1 - && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { - - trx->op_info = (char *) "flushing log"; + if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ - /* Flush the log files to disk */ + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush them to + disk */ - log_write_up_to(trx->commit_lsn, LOG_WAIT_ONE_GROUP, TRUE); + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { - trx->op_info = (char *) ""; - } + /* Write the log but do not flush it to disk */ - return(0); + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_a(0); + } + + trx->op_info = (char*)""; + + 
return(0); } /************************************************************************** @@ -1575,6 +1624,13 @@ trx_print( } buf += sprintf(buf, "\n"); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + + buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n", + trx->n_mysql_tables_in_use, + trx->mysql_n_tables_locked); + } start_of_line = buf; diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c index 2609b8f5241..ebead6424c8 100644 --- a/innobase/ut/ut0mem.c +++ b/innobase/ut/ut0mem.c @@ -201,7 +201,7 @@ ut_free( } /************************************************************************** -Frees all allocated memory not freed yet. */ +Frees in shutdown all allocated memory not freed yet. */ void ut_free_all_mem(void) @@ -209,7 +209,7 @@ ut_free_all_mem(void) { ut_mem_block_t* block; - os_fast_mutex_lock(&ut_list_mutex); + os_fast_mutex_free(&ut_list_mutex); while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) { @@ -222,11 +222,11 @@ ut_free_all_mem(void) free(block); } - os_fast_mutex_unlock(&ut_list_mutex); - - ut_a(ut_total_allocated_memory == 0); - - os_fast_mutex_free(&ut_list_mutex); + if (ut_total_allocated_memory != 0) { + fprintf(stderr, +"InnoDB: Warning: after shutdown total allocated memory is %lu\n", + ut_total_allocated_memory); + } } /************************************************************************** diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index c503cda54b9..4ca113f40ad 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -54,6 +54,8 @@ ut_get_high32( ulint a) /* in: ulint */ { #if SIZEOF_LONG == 4 + UT_NOT_USED(a); + return 0; #else return(a >> 32); |