diff options
Diffstat (limited to 'innobase')
140 files changed, 11441 insertions, 3520 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index 81eb32467ad..27d798f925a 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -427,7 +427,8 @@ btr_page_free_for_ibuf( flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); - ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); } /****************************************************************** @@ -601,8 +602,8 @@ btr_page_get_father_for_rec( UT_LIST_GET_FIRST(tree->tree_indexes)->name); fprintf(stderr, ",\n" "InnoDB: father ptr page no %lu, child page no %lu\n", - btr_node_ptr_get_child_page_no(node_ptr), - buf_frame_get_page_no(page)); + (ulong) btr_node_ptr_get_child_page_no(node_ptr), + (ulong) buf_frame_get_page_no(page)); page_rec_print(page_rec_get_next(page_get_infimum_rec(page))); page_rec_print(node_ptr); @@ -883,7 +884,9 @@ btr_page_reorganize_low( "InnoDB: Error: page old data size %lu new data size %lu\n" "InnoDB: Error: page old max ins size %lu new max ins size %lu\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", - data_size1, data_size2, max_ins_size1, max_ins_size2); + (unsigned long) data_size1, (unsigned long) data_size2, + (unsigned long) max_ins_size1, + (unsigned long) max_ins_size2); } buf_frame_free(new_page); @@ -2224,7 +2227,8 @@ btr_print_recursive( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", - btr_page_get_level(page, mtr), buf_frame_get_page_no(page)); + (ulong) btr_page_get_level(page, mtr), + (ulong) buf_frame_get_page_no(page)); page_print(page, width, width); @@ -2376,7 +2380,7 @@ btr_index_rec_validate( if (rec_get_n_fields(rec) != n) { btr_index_rec_validate_report(page, rec, index); fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", - rec_get_n_fields(rec), n); + (ulong) 
rec_get_n_fields(rec), (ulong) n); if (!dump_on_error) { @@ -2409,7 +2413,7 @@ btr_index_rec_validate( btr_index_rec_validate_report(page, rec, index); fprintf(stderr, "InnoDB: field %lu len is %lu, should be %lu\n", - i, len, dtype_get_fixed_size(type)); + (ulong) i, (ulong) len, (ulong) dtype_get_fixed_size(type)); if (!dump_on_error) { @@ -2639,7 +2643,7 @@ loop: fprintf(stderr, "\n" "InnoDB: node ptr child page n:o %lu\n", - btr_node_ptr_get_child_page_no(node_ptr)); + (unsigned long) btr_node_ptr_get_child_page_no(node_ptr)); fputs("InnoDB: record on page ", stderr); rec_print(stderr, diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index b07b06765e6..be201da4510 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -300,6 +300,7 @@ btr_cur_search_to_nth_level( && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ && !estimate && mode != PAGE_CUR_LE_OR_EXTENDS + && srv_use_adaptive_hash_indexes && btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor, has_search_latch, mtr)) { @@ -398,7 +399,7 @@ btr_cur_search_to_nth_level( retry_page_get: page = buf_page_get_gen(space, page_no, rw_latch, guess, buf_mode, - IB__FILE__, __LINE__, + __FILE__, __LINE__, mtr); if (page == NULL) { /* This must be a search to perform an insert; @@ -509,9 +510,11 @@ retry_page_get: cursor->up_bytes = up_bytes; #ifdef BTR_CUR_ADAPT - btr_search_info_update(index, cursor); -#endif + if (srv_use_adaptive_hash_indexes) { + btr_search_info_update(index, cursor); + } +#endif ut_ad(cursor->up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); ut_ad(cursor->up_match != ULINT_UNDEFINED @@ -577,7 +580,7 @@ btr_cur_open_at_index_side( for (;;) { page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL, BUF_GET, - IB__FILE__, __LINE__, + __FILE__, __LINE__, mtr); ut_ad(0 == ut_dulint_cmp(tree->id, btr_page_get_index_id(page))); @@ -686,7 +689,7 @@ btr_cur_open_at_rnd_pos( for (;;) { page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL, BUF_GET, 
- IB__FILE__, __LINE__, + __FILE__, __LINE__, mtr); ut_ad(0 == ut_dulint_cmp(tree->id, btr_page_get_index_id(page))); @@ -981,7 +984,7 @@ calculate_sizes_again: /* Now, try the insert */ *rec = page_cur_insert_rec_low(page_cursor, entry, data_size, - NULL, mtr); + NULL, mtr); if (!(*rec)) { /* If the record did not fit, reorganize */ btr_page_reorganize(page, mtr); @@ -1000,7 +1003,7 @@ calculate_sizes_again: fputs(" into ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, "\nInnoDB: max insert size %lu\n", - max_size); + (ulong) max_size); ut_error; } } @@ -1068,6 +1071,7 @@ btr_cur_pessimistic_insert( ibool dummy_inh; ibool success; ulint n_extents = 0; + ulint n_reserved; ut_ad(dtuple_check_typed(entry)); @@ -1087,7 +1091,7 @@ btr_cur_pessimistic_insert( cursor->flag = BTR_CUR_BINARY; err = btr_cur_optimistic_insert(flags, cursor, entry, rec, big_rec, - thr, mtr); + thr, mtr); if (err != DB_FAIL) { return(err); @@ -1110,7 +1114,7 @@ btr_cur_pessimistic_insert( n_extents = cursor->tree_height / 16 + 3; - success = fsp_reserve_free_extents(index->space, + success = fsp_reserve_free_extents(&n_reserved, index->space, n_extents, FSP_NORMAL, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -1132,7 +1136,7 @@ btr_cur_pessimistic_insert( if (n_extents > 0) { fil_space_release_free_extents(index->space, - n_extents); + n_reserved); } return(DB_TOO_BIG_RECORD); } @@ -1160,7 +1164,7 @@ btr_cur_pessimistic_insert( err = DB_SUCCESS; if (n_extents > 0) { - fil_space_release_free_extents(index->space, n_extents); + fil_space_release_free_extents(index->space, n_reserved); } *big_rec = big_rec_vec; @@ -1341,7 +1345,8 @@ btr_cur_parse_update_in_place( } /***************************************************************** -Updates a record when the update causes no size changes in its fields. */ +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. 
*/ ulint btr_cur_update_in_place( @@ -1681,6 +1686,7 @@ btr_cur_pessimistic_update( ibool was_first; ibool success; ulint n_extents = 0; + ulint n_reserved; ulint* ext_vect; ulint n_ext_vect; ulint reserve_flag; @@ -1726,7 +1732,8 @@ btr_cur_pessimistic_update( reserve_flag = FSP_NORMAL; } - success = fsp_reserve_free_extents(cursor->index->space, + success = fsp_reserve_free_extents(&n_reserved, + cursor->index->space, n_extents, reserve_flag, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -1875,7 +1882,7 @@ return_after_reservations: if (n_extents > 0) { fil_space_release_free_extents(cursor->index->space, - n_extents); + n_reserved); } *big_rec = big_rec_vec; @@ -2339,6 +2346,7 @@ btr_cur_pessimistic_delete( rec_t* rec; dtuple_t* node_ptr; ulint n_extents = 0; + ulint n_reserved; ibool success; ibool ret = FALSE; mem_heap_t* heap; @@ -2357,7 +2365,8 @@ btr_cur_pessimistic_delete( n_extents = cursor->tree_height / 32 + 1; - success = fsp_reserve_free_extents(cursor->index->space, + success = fsp_reserve_free_extents(&n_reserved, + cursor->index->space, n_extents, FSP_CLEANING, mtr); if (!success) { *err = DB_OUT_OF_FILE_SPACE; @@ -2436,7 +2445,8 @@ return_after_reservations: } if (n_extents > 0) { - fil_space_release_free_extents(cursor->index->space, n_extents); + fil_space_release_free_extents(cursor->index->space, + n_reserved); } return(ret); @@ -2934,7 +2944,7 @@ btr_cur_mark_dtuple_inherited_extern( if (!is_updated) { dfield = dtuple_get_nth_field(entry, ext_vec[i]); - data = dfield_get_data(dfield); + data = (byte*) dfield_get_data(dfield); len = dfield_get_len(dfield); len -= BTR_EXTERN_FIELD_REF_SIZE; @@ -2994,7 +3004,7 @@ btr_cur_unmark_dtuple_extern_fields( for (i = 0; i < n_ext_vec; i++) { dfield = dtuple_get_nth_field(entry, ext_vec[i]); - data = dfield_get_data(dfield); + data = (byte*) dfield_get_data(dfield); len = dfield_get_len(dfield); len -= BTR_EXTERN_FIELD_REF_SIZE; @@ -3132,7 +3142,7 @@ btr_store_big_rec_extern_fields( 
ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), - MTR_MEMO_PAGE_X_FIX)); + MTR_MEMO_PAGE_X_FIX)); ut_a(index->type & DICT_CLUSTERED); space_id = buf_frame_get_space_id(rec); @@ -3300,7 +3310,7 @@ btr_free_externally_stored_field( ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data), - MTR_MEMO_PAGE_X_FIX)); + MTR_MEMO_PAGE_X_FIX)); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index 4725551d4d7..cf8a612ef28 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -95,7 +95,9 @@ btr_pcur_store_position( ut_a(cursor->latch_mode != BTR_NO_LATCHES); if (page_get_n_recs(page) == 0) { - /* It must be an empty index tree */ + /* It must be an empty index tree; NOTE that in this case + we do not store the modify_clock, but always do a search + if we restore the cursor position */ ut_a(btr_page_get_next(page, mtr) == FIL_NULL && btr_page_get_prev(page, mtr) == FIL_NULL); @@ -128,12 +130,13 @@ btr_pcur_store_position( } else { cursor->rel_pos = BTR_PCUR_ON; } - + cursor->old_stored = BTR_PCUR_OLD_STORED; cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec, &(cursor->old_rec_buf), &(cursor->buf_size)); + cursor->block_when_stored = buf_block_align(page); cursor->modify_clock = buf_frame_get_modify_clock(page); } @@ -205,6 +208,9 @@ btr_pcur_restore_position( if (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + /* In these cases we do not try an optimistic restoration, + but always do a search */ + if (cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { from_left = TRUE; } else { @@ -214,6 +220,10 @@ btr_pcur_restore_position( btr_cur_open_at_index_side(from_left, btr_pcur_get_btr_cur(cursor)->index, latch_mode, 
btr_pcur_get_btr_cur(cursor), mtr); + + cursor->block_when_stored = + buf_block_align(btr_pcur_get_page(cursor)); + return(FALSE); } @@ -224,8 +234,9 @@ btr_pcur_restore_position( if (latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF) { /* Try optimistic restoration */ - if (buf_page_optimistic_get(latch_mode, page, - cursor->modify_clock, mtr)) { + if (buf_page_optimistic_get(latch_mode, + cursor->block_when_stored, page, + cursor->modify_clock, mtr)) { cursor->pos_state = BTR_PCUR_IS_POSITIONED; #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_TREE_NODE); @@ -270,8 +281,6 @@ btr_pcur_restore_position( btr_pcur_open_with_no_init(btr_pcur_get_btr_cur(cursor)->index, tuple, mode, latch_mode, cursor, 0, mtr); - - cursor->old_stored = BTR_PCUR_OLD_STORED; /* Restore the old search mode */ cursor->search_mode = old_mode; @@ -280,11 +289,18 @@ btr_pcur_restore_position( && btr_pcur_is_on_user_rec(cursor, mtr) && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) { - /* We have to store the NEW value for the modify clock, since - the cursor can now be on a different page! */ + /* We have to store the NEW value for the modify clock, since + the cursor can now be on a different page! But we can retain + the value of old_rec */ + + cursor->modify_clock = + buf_frame_get_modify_clock(btr_pcur_get_page(cursor)); + + cursor->block_when_stored = + buf_block_align(btr_pcur_get_page(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_STORED; - cursor->modify_clock = buf_frame_get_modify_clock( - buf_frame_align(btr_pcur_get_rec(cursor))); mem_heap_free(heap); return(TRUE); @@ -292,9 +308,10 @@ btr_pcur_restore_position( mem_heap_free(heap); - /* We have to store position information, modify clock value, etc. - because the cursor may now be on a different page */ - + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. 
*/ + btr_pcur_store_position(cursor, mtr); return(FALSE); diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c index e01ce94dc97..9384168df88 100644 --- a/innobase/btr/btr0sea.c +++ b/innobase/btr/btr0sea.c @@ -764,7 +764,7 @@ btr_search_guess_on_hash( success = buf_page_get_known_nowait(latch_mode, page, BUF_MAKE_YOUNG, - IB__FILE__, __LINE__, + __FILE__, __LINE__, mtr); rw_lock_s_unlock(&btr_search_latch); @@ -792,8 +792,8 @@ btr_search_guess_on_hash( goto failure; } - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); - ut_ad(page_rec_is_user_rec(rec)); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(page_rec_is_user_rec(rec)); btr_cur_position(index, rec, cursor); @@ -1042,12 +1042,14 @@ btr_search_drop_page_hash_when_freed( mtr_start(&mtr); - /* We assume that if the caller has a latch on the page, - then the caller has already dropped the hash index for the page, - and we never get here. Therefore we can acquire the s-latch to - the page without having to fear a deadlock. */ + /* We assume that if the caller has a latch on the page, then the + caller has already dropped the hash index for the page, and we never + get here. Therefore we can acquire the s-latch to the page without + having to fear a deadlock. 
*/ - page = buf_page_get(space, page_no, RW_S_LATCH, &mtr); + page = buf_page_get_gen(space, page_no, RW_S_LATCH, NULL, + BUF_GET_IF_IN_POOL, __FILE__, __LINE__, + &mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_TREE_NODE_FROM_HASH); @@ -1565,22 +1567,24 @@ btr_search_validate(void) fprintf(stderr, " InnoDB: Error in an adaptive hash index pointer to page %lu\n" "ptr mem address %p index id %lu %lu, node fold %lu, rec fold %lu\n", - buf_frame_get_page_no(page), + (ulong) buf_frame_get_page_no(page), node->data, - ut_dulint_get_high(btr_page_get_index_id(page)), - ut_dulint_get_low(btr_page_get_index_id(page)), - node->fold, rec_fold((rec_t*)(node->data), - block->curr_n_fields, - block->curr_n_bytes, - btr_page_get_index_id(page))); + (ulong) ut_dulint_get_high(btr_page_get_index_id(page)), + (ulong) ut_dulint_get_low(btr_page_get_index_id(page)), + (ulong) node->fold, + (ulong) rec_fold((rec_t*)(node->data), + block->curr_n_fields, + block->curr_n_bytes, + btr_page_get_index_id(page))); fputs("InnoDB: Record ", stderr); rec_print(stderr, (rec_t*)(node->data)); fprintf(stderr, "\nInnoDB: on that page." "Page mem address %p, is hashed %lu, n fields %lu, n bytes %lu\n" "side %lu\n", - page, block->is_hashed, block->curr_n_fields, - block->curr_n_bytes, block->curr_side); + page, (ulong) block->is_hashed, + (ulong) block->curr_n_fields, + (ulong) block->curr_n_bytes, (ulong) block->curr_side); if (n_page_dumps < 20) { buf_page_print(page); diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index b744430a76e..5ec8998473d 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -197,7 +197,29 @@ If a new page is referenced in the buf_pool, and several pages of its random access area (for instance, 32 consecutive pages in a tablespace) have recently been referenced, we may predict that the whole area may be needed in the near future, and issue -the read requests for the whole area. */ +the read requests for the whole area. 
+ + AWE implementation + ------------------ + +By a 'block' we mean the buffer header of type buf_block_t. By a 'page' +we mean the physical 16 kB memory area allocated from RAM for that block. +By a 'frame' we mean a 16 kB area in the virtual address space of the +process, in the frame_mem of buf_pool. + +We can map pages to the frames of the buffer pool. + +1) A buffer block allocated for use as a non-data page, e.g., for the lock +table, is always mapped to a frame. +2) A bufferfixed or io-fixed data page is always mapped to a frame. +3) When we need to map a block to a frame, we scan the list +awe_LRU_free_mapped from its end and try to unmap the last block in it, but +note that bufferfixed or io-fixed pages cannot be unmapped. +4) For every frame in the buffer pool there is always a block whose page is +mapped to it. When we create the buffer pool, we map the first elements +in the free list to the frames. +5) When we have AWE enabled, we disable adaptive hash indexes. +*/ buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ @@ -221,9 +243,10 @@ buf_calc_page_new_checksum( { ulint checksum; - /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO - are written outside the buffer pool to the first pages of data - files, we have to skip them in the page checksum calculation. + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + ..._ARCH_LOG_NO, are written outside the buffer pool to the first + pages of data files, we have to skip them in the page checksum + calculation. We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the checksum is stored, and also the last 8 bytes of page because there we store the old formula checksum. 
*/ @@ -233,7 +256,7 @@ buf_calc_page_new_checksum( + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA - FIL_PAGE_END_LSN_OLD_CHKSUM); - checksum = checksum & 0xFFFFFFFF; + checksum = checksum & 0xFFFFFFFFUL; return(checksum); } @@ -256,7 +279,7 @@ buf_calc_page_old_checksum( checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - checksum = checksum & 0xFFFFFFFF; + checksum = checksum & 0xFFFFFFFFUL; return(checksum); } @@ -274,8 +297,9 @@ buf_page_is_corrupted( ulint old_checksum; ulint checksum_field; ulint old_checksum_field; +#ifndef UNIV_HOTBACKUP dulint current_lsn; - +#endif if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) != mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { @@ -297,13 +321,13 @@ buf_page_is_corrupted( " InnoDB: Error: page %lu log sequence number %lu %lu\n" "InnoDB: is in the future! Current system log sequence number %lu %lu.\n" "InnoDB: Your database may be corrupt.\n", - mach_read_from_4(read_buf + FIL_PAGE_OFFSET), - ut_dulint_get_high( + (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + (ulong) ut_dulint_get_high( mach_read_from_8(read_buf + FIL_PAGE_LSN)), - ut_dulint_get_low( + (ulong) ut_dulint_get_low( mach_read_from_8(read_buf + FIL_PAGE_LSN)), - ut_dulint_get_high(current_lsn), - ut_dulint_get_low(current_lsn)); + (ulong) ut_dulint_get_high(current_lsn), + (ulong) ut_dulint_get_low(current_lsn)); } } #endif @@ -362,16 +386,21 @@ buf_page_print( fprintf(stderr, " InnoDB: Page checksum %lu, prior-to-4.0.14-form checksum %lu\n" "InnoDB: stored checksum %lu, prior-to-4.0.14-form stored checksum %lu\n", - checksum, old_checksum, - mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), - mach_read_from_4(read_buf + UNIV_PAGE_SIZE + (ulong) checksum, (ulong) old_checksum, + (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM)); fprintf(stderr, - "InnoDB: Page lsn %lu %lu, low 4 bytes 
of lsn at page end %lu\n", - mach_read_from_4(read_buf + FIL_PAGE_LSN), - mach_read_from_4(read_buf + FIL_PAGE_LSN + 4), - mach_read_from_4(read_buf + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)); +"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n" +"InnoDB: Page number (if stored to page already) %lu,\n" +"InnoDB: space id (if created with >= MySQL-4.1.1 and stored already) %lu\n", + (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4), + (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT) { fprintf(stderr, @@ -385,12 +414,9 @@ buf_page_print( if (fil_page_get_type(read_buf) == FIL_PAGE_INDEX) { fprintf(stderr, - "InnoDB: Page may be an index page "); - - fprintf(stderr, - "where index id is %lu %lu\n", - ut_dulint_get_high(btr_page_get_index_id(read_buf)), - ut_dulint_get_low(btr_page_get_index_id(read_buf))); +"InnoDB: Page may be an index page where index id is %lu %lu\n", + (ulong) ut_dulint_get_high(btr_page_get_index_id(read_buf)), + (ulong) ut_dulint_get_low(btr_page_get_index_id(read_buf))); /* If the code is in ibbackup, dict_sys may be uninitialized, i.e., NULL */ @@ -405,7 +431,6 @@ buf_page_print( fputs(")\n", stderr); } } - } else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) { fputs("InnoDB: Page may be an 'inode' page\n", stderr); } else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) { @@ -421,23 +446,29 @@ void buf_block_init( /*===========*/ buf_block_t* block, /* in: pointer to control block */ - byte* frame) /* in: pointer to buffer frame */ + byte* frame) /* in: pointer to buffer frame, or NULL if in + the case of AWE there is no frame */ { block->state = BUF_BLOCK_NOT_USED; block->frame = frame; 
+ block->awe_info = NULL; + block->modify_clock = ut_dulint_zero; block->file_page_was_freed = FALSE; block->check_index_page_at_flush = FALSE; + block->in_free_list = FALSE; + block->in_LRU_list = FALSE; + + block->n_pointers = 0; + rw_lock_create(&(block->lock)); ut_ad(rw_lock_validate(&(block->lock))); - rw_lock_create(&(block->read_lock)); - rw_lock_set_level(&(block->read_lock), SYNC_NO_ORDER_CHECK); #ifdef UNIV_SYNC_DEBUG rw_lock_create(&(block->debug_latch)); rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK); @@ -445,25 +476,40 @@ buf_block_init( } /************************************************************************ -Creates a buffer buf_pool object. */ -static +Creates the buffer pool. */ + buf_pool_t* -buf_pool_create( -/*============*/ +buf_pool_init( +/*==========*/ /* out, own: buf_pool object, NULL if not - enough memory */ + enough memory or error */ ulint max_size, /* in: maximum size of the buf_pool in blocks */ - ulint curr_size) /* in: current size to use, must be <= + ulint curr_size, /* in: current size to use, must be <= max_size, currently must be equal to max_size */ + ulint n_frames) /* in: number of frames; if AWE is used, + this is the size of the address space window + where physical memory pages are mapped; if + AWE is not used then this must be the same + as max_size */ { byte* frame; ulint i; buf_block_t* block; ut_a(max_size == curr_size); + ut_a(srv_use_awe || n_frames == max_size); + if (n_frames > curr_size) { + fprintf(stderr, +"InnoDB: AWE: Error: you must specify in my.cnf .._awe_mem_mb larger\n" +"InnoDB: than .._buffer_pool_size. Now the former is %lu pages,\n" +"InnoDB: the latter %lu pages.\n", (ulong) curr_size, (ulong) n_frames); + + return(NULL); + } + buf_pool = mem_alloc(sizeof(buf_pool_t)); /* 1. 
Initialize general fields @@ -472,8 +518,38 @@ buf_pool_create( mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); mutex_enter(&(buf_pool->mutex)); - - buf_pool->frame_mem = ut_malloc(UNIV_PAGE_SIZE * (max_size + 1)); + + if (srv_use_awe) { + /*----------------------------------------*/ + /* Allocate the virtual address space window, i.e., the + buffer pool frames */ + + buf_pool->frame_mem = os_awe_allocate_virtual_mem_window( + UNIV_PAGE_SIZE * (n_frames + 1)); + + /* Allocate the physical memory for AWE and the AWE info array + for buf_pool */ + + if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) { + + fprintf(stderr, +"InnoDB: AWE: Error: physical memory must be allocated in full megabytes.\n" +"InnoDB: Trying to allocate %lu database pages.\n", + (ulong) curr_size); + + return(NULL); + } + + if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info), + curr_size / ((1024 * 1024) / UNIV_PAGE_SIZE))) { + + return(NULL); + } + /*----------------------------------------*/ + } else { + buf_pool->frame_mem = ut_malloc( + UNIV_PAGE_SIZE * (n_frames + 1)); + } if (buf_pool->frame_mem == NULL) { @@ -490,21 +566,60 @@ buf_pool_create( buf_pool->max_size = max_size; buf_pool->curr_size = curr_size; + buf_pool->n_frames = n_frames; + /* Align pointer to the first frame */ frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE); + buf_pool->frame_zero = frame; + buf_pool->high_end = frame + UNIV_PAGE_SIZE * n_frames; + + if (srv_use_awe) { + /*----------------------------------------*/ + /* Map an initial part of the allocated physical memory to + the window */ + + os_awe_map_physical_mem_to_window(buf_pool->frame_zero, + n_frames * + (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE), + buf_pool->awe_info); + /*----------------------------------------*/ + } + + buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames); + + if (buf_pool->blocks_of_frames == NULL) { - buf_pool->high_end = frame + UNIV_PAGE_SIZE * curr_size; + return(NULL); + } + + /* Init block structs and 
assign frames for them; in the case of + AWE there are less frames than blocks. Then we assign the frames + to the first blocks (we already mapped the memory above). We also + init the awe_info for every block. */ - /* Init block structs and assign frames for them */ for (i = 0; i < max_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); + + if (i < n_frames) { + frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE; + *(buf_pool->blocks_of_frames + i) = block; + } else { + frame = NULL; + } + buf_block_init(block, frame); - frame = frame + UNIV_PAGE_SIZE; + + if (srv_use_awe) { + /*----------------------------------------*/ + block->awe_info = buf_pool->awe_info + + i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE); + /*----------------------------------------*/ + } } - + buf_pool->page_hash = hash_create(2 * max_size); buf_pool->n_pend_reads = 0; @@ -514,12 +629,14 @@ buf_pool_create( buf_pool->n_pages_read = 0; buf_pool->n_pages_written = 0; buf_pool->n_pages_created = 0; - + buf_pool->n_pages_awe_remapped = 0; + buf_pool->n_page_gets = 0; buf_pool->n_page_gets_old = 0; buf_pool->n_pages_read_old = 0; buf_pool->n_pages_written_old = 0; buf_pool->n_pages_created_old = 0; + buf_pool->n_pages_awe_remapped_old = 0; /* 2. 
Initialize flushing fields ---------------------------- */ @@ -542,37 +659,124 @@ buf_pool_create( buf_pool->LRU_old = NULL; + UT_LIST_INIT(buf_pool->awe_LRU_free_mapped); + /* Add control blocks to the free list */ UT_LIST_INIT(buf_pool->free); + for (i = 0; i < curr_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); - UT_LIST_ADD_FIRST(free, buf_pool->free, block); + if (block->frame) { + /* Wipe contents of frame to eliminate a Purify + warning */ + +#ifdef HAVE_purify + memset(block->frame, '\0', UNIV_PAGE_SIZE); +#endif + if (srv_use_awe) { + /* Add to the list of blocks mapped to + frames */ + + UT_LIST_ADD_LAST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + } + + UT_LIST_ADD_LAST(free, buf_pool->free, block); + block->in_free_list = TRUE; } mutex_exit(&(buf_pool->mutex)); - btr_search_sys_create(curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + if (srv_use_adaptive_hash_indexes) { + btr_search_sys_create( + curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + } else { + /* Create only a small dummy system */ + btr_search_sys_create(1000); + } return(buf_pool); } /************************************************************************ -Initializes the buffer buf_pool of the database. */ +Maps the page of block to a frame, if not mapped yet. Unmaps some page +from the end of the awe_LRU_free_mapped. 
*/ void -buf_pool_init( -/*==========*/ - ulint max_size, /* in: maximum size of the buf_pool in blocks */ - ulint curr_size) /* in: current size to use, must be <= - max_size */ +buf_awe_map_page_to_frame( +/*======================*/ + buf_block_t* block, /* in: block whose page should be + mapped to a frame */ + ibool add_to_mapped_list) /* in: TRUE if we in the case + we need to map the page should also + add the block to the + awe_LRU_free_mapped list */ { - ut_a(buf_pool == NULL); + buf_block_t* bck; - buf_pool_create(max_size, curr_size); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(buf_pool->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(block); + + if (block->frame) { + + return; + } - ut_ad(buf_validate()); + /* Scan awe_LRU_free_mapped from the end and try to find a block + which is not bufferfixed or io-fixed */ + + bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped); + + while (bck) { + if (bck->state == BUF_BLOCK_FILE_PAGE + && (bck->buf_fix_count != 0 || bck->io_fix != 0)) { + + /* We have to skip this */ + bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck); + } else { + /* We can map block to the frame of bck */ + + os_awe_map_physical_mem_to_window( + bck->frame, + UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE, + block->awe_info); + + block->frame = bck->frame; + + *(buf_pool->blocks_of_frames + + (((ulint)(block->frame + - buf_pool->frame_zero)) + >> UNIV_PAGE_SIZE_SHIFT)) + = block; + + bck->frame = NULL; + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, + bck); + + if (add_to_mapped_list) { + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, + block); + } + + buf_pool->n_pages_awe_remapped++; + + return; + } + } + + fprintf(stderr, +"InnoDB: AWE: Fatal error: cannot find a page to unmap\n" +"InnoDB: awe_LRU_free_mapped list length %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + + ut_a(0); } /************************************************************************ @@ -581,7 +785,9 @@ 
UNIV_INLINE buf_block_t* buf_block_alloc(void) /*=================*/ - /* out, own: the allocated block */ + /* out, own: the allocated block; also if AWE + is used it is guaranteed that the page is + mapped to a frame */ { buf_block_t* block; @@ -625,7 +831,7 @@ buf_page_make_young( block = buf_block_align(frame); - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); buf_LRU_make_block_young(block); @@ -640,7 +846,7 @@ buf_block_free( /*===========*/ buf_block_t* block) /* in, own: block to be freed */ { - ut_ad(block->state != BUF_BLOCK_FILE_PAGE); + ut_a(block->state != BUF_BLOCK_FILE_PAGE); mutex_enter(&(buf_pool->mutex)); @@ -842,7 +1048,7 @@ buf_page_get_gen( buf_frame_t* guess, /* in: guessed frame or NULL */ ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL, BUF_GET_NO_LATCH, BUF_GET_NOWAIT */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr) /* in: mini-transaction */ { @@ -904,6 +1110,8 @@ loop: goto loop; } + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + must_read = FALSE; if (block->io_fix == BUF_IO_READ) { @@ -919,6 +1127,19 @@ loop: } } + /* If AWE is enabled and the page is not mapped to a frame, then + map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is in the + LRU list and we must put it to awe_LRU_free_mapped list once + mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + #ifdef UNIV_SYNC_DEBUG buf_block_buf_fix_inc_debug(block, file, line); #else @@ -973,8 +1194,26 @@ loop: } else if (rw_latch == RW_NO_LATCH) { if (must_read) { - rw_lock_x_lock(&(block->read_lock)); - rw_lock_x_unlock(&(block->read_lock)); + /* Let us wait until the read operation + completes */ + + for (;;) { + mutex_enter(&(buf_pool->mutex)); + + if (block->io_fix == BUF_IO_READ) { + + mutex_exit(&(buf_pool->mutex)); + + /* Sleep 20 milliseconds */ + + os_thread_sleep(20000); + } else 
{ + + mutex_exit(&(buf_pool->mutex)); + + break; + } + } } fix_type = MTR_MEMO_BUF_FIX; @@ -1013,28 +1252,27 @@ buf_page_optimistic_get_func( /*=========================*/ /* out: TRUE if success */ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ - buf_frame_t* guess, /* in: guessed frame */ + buf_block_t* block, /* in: guessed buffer block */ + buf_frame_t* guess, /* in: guessed frame; note that AWE may move + frames */ dulint modify_clock,/* in: modify clock value if mode is ..._GUESS_ON_CLOCK */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr) /* in: mini-transaction */ { - buf_block_t* block; ibool accessed; ibool success; ulint fix_type; - ut_ad(mtr && guess); + ut_ad(mtr && block); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - - buf_pool->n_page_gets++; - - block = buf_block_align(guess); mutex_enter(&(buf_pool->mutex)); - if (block->state != BUF_BLOCK_FILE_PAGE) { + /* If AWE is used, block may have a different frame now, e.g., NULL */ + + if (block->state != BUF_BLOCK_FILE_PAGE || block->frame != guess) { mutex_exit(&(buf_pool->mutex)); @@ -1127,12 +1365,15 @@ buf_page_optimistic_get_func( #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0); #endif + buf_pool->n_page_gets++; + return(TRUE); } /************************************************************************ This is used to get access to a known database page, when no waiting can be -done. */ +done. For example, if a search in an adaptive hash index leads us to this +frame. 
*/ ibool buf_page_get_known_nowait( @@ -1141,7 +1382,7 @@ buf_page_get_known_nowait( ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ buf_frame_t* guess, /* in: the known page frame */ ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr) /* in: mini-transaction */ { @@ -1151,13 +1392,11 @@ buf_page_get_known_nowait( ut_ad(mtr); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - - buf_pool->n_page_gets++; - - block = buf_block_align(guess); mutex_enter(&(buf_pool->mutex)); + block = buf_block_align(guess); + if (block->state == BUF_BLOCK_REMOVE_HASH) { /* Another thread is just freeing the block from the LRU list of the buffer pool: do not try to access this page; this @@ -1171,6 +1410,8 @@ buf_page_get_known_nowait( return(FALSE); } + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + #ifdef UNIV_SYNC_DEBUG buf_block_buf_fix_inc_debug(block, file, line); #else @@ -1225,6 +1466,8 @@ buf_page_get_known_nowait( ut_a((mode == BUF_KEEP_OLD) || (ibuf_count_get(block->space, block->offset) == 0)); #endif + buf_pool->n_page_gets++; + return(TRUE); } @@ -1281,7 +1524,7 @@ buf_page_init( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(block->state == BUF_BLOCK_READY_FOR_USE); + ut_a(block->state != BUF_BLOCK_FILE_PAGE); /* Set the state of the block */ block->magic_n = BUF_BLOCK_MAGIC_N; @@ -1297,6 +1540,20 @@ buf_page_init( /* Insert into the hash table of file pages */ + if (buf_page_hash_get(space, offset)) { + fprintf(stderr, +"InnoDB: Error: page %lu %lu already found from the hash table\n", + (ulong) space, + (ulong) offset); +#ifdef UNIV_DEBUG + buf_print(); + buf_LRU_print(); + buf_validate(); + buf_LRU_validate(); +#endif /* UNIV_DEBUG */ + ut_a(0); + } + HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, buf_page_address_fold(space, offset), block); @@ -1320,25 +1577,35 @@ 
buf_page_init( /************************************************************************ Function which inits a page for read to the buffer buf_pool. If the page is -already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and -sets a non-recursive exclusive lock on the buffer frame. The io-handler must -take care that the flag is cleared and the lock released later. This is one -of the functions which perform the state transition NOT_USED => FILE_PAGE to -a block (the other is buf_page_create). */ +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. This is one of the functions which perform the +state transition NOT_USED => FILE_PAGE to a block (the other is +buf_page_create). */ buf_block_t* buf_page_init_for_read( /*===================*/ - /* out: pointer to the block or NULL */ - ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ - ulint space, /* in: space id */ - ulint offset) /* in: page number */ + /* out: pointer to the block or NULL */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... 
*/ + ulint space, /* in: space id */ + ib_longlong tablespace_version,/* in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset) /* in: page number */ { buf_block_t* block; mtr_t mtr; - + ut_ad(buf_pool); + *err = DB_SUCCESS; + if (mode == BUF_READ_IBUF_PAGES_ONLY) { /* It is a read-ahead within an ibuf routine */ @@ -1359,13 +1626,20 @@ buf_page_init_for_read( block = buf_block_alloc(); - ut_ad(block); + ut_a(block); mutex_enter(&(buf_pool->mutex)); - - if (NULL != buf_page_hash_get(space, offset)) { - /* The page is already in buf_pool, return */ + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, + tablespace_version)) { + *err = DB_TABLESPACE_DELETED; + } + + if (*err == DB_TABLESPACE_DELETED + || NULL != buf_page_hash_get(space, offset)) { + + /* The page belongs to a space which has been deleted or is + being deleted, or the page is already in buf_pool, return */ mutex_exit(&(buf_pool->mutex)); buf_block_free(block); @@ -1397,8 +1671,6 @@ buf_page_init_for_read( is completed. The x-lock is cleared by the io-handler thread. 
*/ rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ); - - rw_lock_x_lock_gen(&(block->read_lock), BUF_IO_READ); mutex_exit(&(buf_pool->mutex)); @@ -1457,7 +1729,7 @@ buf_page_create( if (buf_debug_prints) { fprintf(stderr, "Creating space %lu page %lu to buffer\n", - space, offset); + (ulong) space, (ulong) offset); } block = free_block; @@ -1468,7 +1740,7 @@ buf_page_create( buf_LRU_add_block(block, FALSE); #ifdef UNIV_SYNC_DEBUG - buf_block_buf_fix_inc_debug(block, IB__FILE__, __LINE__); + buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); #else buf_block_buf_fix_inc(block); #endif @@ -1483,7 +1755,7 @@ buf_page_create( /* Delete possible entries for the page from the insert buffer: such can exist if the page belonged to an index which was dropped */ - ibuf_merge_or_delete_for_page(NULL, space, offset); + ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE); /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); @@ -1516,6 +1788,8 @@ buf_page_io_complete( ut_ad(block); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + io_type = block->io_fix; if (io_type == BUF_IO_READ) { @@ -1531,7 +1805,7 @@ buf_page_io_complete( fprintf(stderr, "InnoDB: Error: page n:o stored in the page read in is %lu, should be %lu!\n", - read_page_no, block->offset); + (ulong) read_page_no, (ulong) block->offset); } /* From version 3.23.38 up we store the page checksum to the 4 first bytes of the page end lsn field */ @@ -1539,7 +1813,7 @@ buf_page_io_complete( if (buf_page_is_corrupted(block->frame)) { fprintf(stderr, "InnoDB: Database page corruption on disk or a failed\n" - "InnoDB: file read of page %lu.\n", block->offset); + "InnoDB: file read of page %lu.\n", (ulong) block->offset); fputs( "InnoDB: You may have to recover from a backup.\n", stderr); @@ -1548,7 +1822,7 @@ buf_page_io_complete( fprintf(stderr, "InnoDB: Database page corruption on disk or a failed\n" - "InnoDB: file read of page %lu.\n", block->offset); + "InnoDB: file read of page 
%lu.\n", (ulong) block->offset); fputs( "InnoDB: You may have to recover from a backup.\n", stderr); fputs( @@ -1580,7 +1854,7 @@ buf_page_io_complete( if (!recv_no_ibuf_operations) { ibuf_merge_or_delete_for_page(block->frame, - block->space, block->offset); + block->space, block->offset, TRUE); } } @@ -1605,9 +1879,7 @@ buf_page_io_complete( buf_pool->n_pend_reads--; buf_pool->n_pages_read++; - rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ); - rw_lock_x_unlock_gen(&(block->read_lock), BUF_IO_READ); if (buf_debug_prints) { fputs("Has read ", stderr); @@ -1633,7 +1905,7 @@ buf_page_io_complete( if (buf_debug_prints) { fprintf(stderr, "page space %lu page no %lu\n", - block->space, block->offset); + (ulong) block->space, (ulong) block->offset); } } @@ -1734,14 +2006,14 @@ buf_validate(void) } if (n_lru + n_free > buf_pool->curr_size) { - fprintf(stderr, "n LRU %lu, n free %lu\n", n_lru, n_free); + fprintf(stderr, "n LRU %lu, n free %lu\n", (ulong) n_lru, (ulong) n_free); ut_error; } ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { fprintf(stderr, "Free list len %lu, free blocks %lu\n", - UT_LIST_GET_LEN(buf_pool->free), n_free); + (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free); ut_error; } ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); @@ -1777,7 +2049,7 @@ buf_print(void) ut_ad(buf_pool); - size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + size = buf_pool->curr_size; index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); @@ -1785,23 +2057,23 @@ buf_print(void) mutex_enter(&(buf_pool->mutex)); fprintf(stderr, - "buf_pool size %lu \n" - "database pages %lu \n" - "free pages %lu \n" - "modified database pages %lu \n" - "n pending reads %lu \n" + "buf_pool size %lu\n" + "database pages %lu\n" + "free pages %lu\n" + "modified database pages %lu\n" + "n pending reads %lu\n" "n pending flush LRU %lu list %lu single page %lu\n" "pages read %lu, created %lu, written 
%lu\n", - size, - UT_LIST_GET_LEN(buf_pool->LRU), - UT_LIST_GET_LEN(buf_pool->free), - UT_LIST_GET_LEN(buf_pool->flush_list), - buf_pool->n_pend_reads, - buf_pool->n_flush[BUF_FLUSH_LRU], - buf_pool->n_flush[BUF_FLUSH_LIST], - buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE], - buf_pool->n_pages_read, buf_pool->n_pages_created, - buf_pool->n_pages_written); + (ulong) size, + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), + (ulong) buf_pool->n_pend_reads, + (ulong) buf_pool->n_flush[BUF_FLUSH_LRU], + (ulong) buf_pool->n_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE], + (ulong) buf_pool->n_pages_read, buf_pool->n_pages_created, + (ulong) buf_pool->n_pages_written); /* Count the number of blocks belonging to each index in the buffer */ @@ -1846,7 +2118,8 @@ buf_print(void) fprintf(stderr, "Block count for index %lu in buffer is about %lu", - ut_dulint_get_low(index_ids[i]), counts[i]); + (ulong) ut_dulint_get_low(index_ids[i]), + (ulong) counts[i]); if (index) { putc(' ', stderr); @@ -1911,27 +2184,36 @@ buf_print_io( ulint size; ut_ad(buf_pool); - size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + size = buf_pool->curr_size; mutex_enter(&(buf_pool->mutex)); + if (srv_use_awe) { + fprintf(stderr, + "AWE: Buffer pool memory frames %lu\n", + (ulong) buf_pool->n_frames); + + fprintf(stderr, + "AWE: Database pages and free buffers mapped in frames %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } fprintf(file, "Buffer pool size %lu\n" "Free buffers %lu\n" "Database pages %lu\n" "Modified db pages %lu\n" - "Pending reads %lu \n" + "Pending reads %lu\n" "Pending writes: LRU %lu, flush list %lu, single page %lu\n", - size, - UT_LIST_GET_LEN(buf_pool->free), - UT_LIST_GET_LEN(buf_pool->LRU), - UT_LIST_GET_LEN(buf_pool->flush_list), - buf_pool->n_pend_reads, - buf_pool->n_flush[BUF_FLUSH_LRU] - + buf_pool->init_flush[BUF_FLUSH_LRU], - 
buf_pool->n_flush[BUF_FLUSH_LIST] - + buf_pool->init_flush[BUF_FLUSH_LIST], - buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + (ulong) size, + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), + (ulong) buf_pool->n_pend_reads, + (ulong) buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->init_flush[BUF_FLUSH_LRU], + (ulong) buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->init_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, @@ -1941,8 +2223,9 @@ buf_print_io( fprintf(file, "Pages read %lu, created %lu, written %lu\n" "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", - buf_pool->n_pages_read, buf_pool->n_pages_created, - buf_pool->n_pages_written, + (ulong) buf_pool->n_pages_read, + (ulong) buf_pool->n_pages_created, + (ulong) buf_pool->n_pages_written, (buf_pool->n_pages_read - buf_pool->n_pages_read_old) / time_elapsed, (buf_pool->n_pages_created - buf_pool->n_pages_created_old) @@ -1950,12 +2233,19 @@ buf_print_io( (buf_pool->n_pages_written - buf_pool->n_pages_written_old) / time_elapsed); + if (srv_use_awe) { + fprintf(file, "AWE: %.2f page remaps/s\n", + (buf_pool->n_pages_awe_remapped + - buf_pool->n_pages_awe_remapped_old) + / time_elapsed); + } + if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) { fprintf(file, "Buffer pool hit rate %lu / 1000\n", - 1000 + (ulong) (1000 - ((1000 * (buf_pool->n_pages_read - buf_pool->n_pages_read_old)) - / (buf_pool->n_page_gets - buf_pool->n_page_gets_old))); + / (buf_pool->n_page_gets - buf_pool->n_page_gets_old)))); } else { fputs("No buffer pool page gets since the last printout\n", file); @@ -1965,6 +2255,7 @@ buf_print_io( buf_pool->n_pages_read_old = buf_pool->n_pages_read; buf_pool->n_pages_created_old = buf_pool->n_pages_created; buf_pool->n_pages_written_old = buf_pool->n_pages_written; + buf_pool->n_pages_awe_remapped_old = 
buf_pool->n_pages_awe_remapped; mutex_exit(&(buf_pool->mutex)); } @@ -1981,6 +2272,7 @@ buf_refresh_io_stats(void) buf_pool->n_pages_read_old = buf_pool->n_pages_read; buf_pool->n_pages_created_old = buf_pool->n_pages_created; buf_pool->n_pages_written_old = buf_pool->n_pages_written; + buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped; } /************************************************************************* @@ -2007,7 +2299,7 @@ buf_all_freed(void) fprintf(stderr, "Page %lu %lu still fixed or dirty\n", - block->space, block->offset); + (ulong) block->space, (ulong) block->offset); ut_error; } } diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 7456e5d6f61..6cefdb60956 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -24,6 +24,7 @@ Created 11/11/1995 Heikki Tuuri #include "log0log.h" #include "os0file.h" #include "trx0sys.h" +#include "srv0srv.h" /* When flushed, dirty blocks are searched in neigborhoods of this size, and flushed along with the original page. 
*/ @@ -51,6 +52,8 @@ buf_flush_insert_into_flush_list( ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) || (ut_dulint_cmp( (UT_LIST_GET_FIRST(buf_pool->flush_list)) @@ -107,7 +110,7 @@ buf_flush_ready_for_replace( /*========================*/ /* out: TRUE if can replace immediately */ buf_block_t* block) /* in: buffer control block, must be in state - BUF_BLOCK_FILE_PAGE and in the LRU list*/ + BUF_BLOCK_FILE_PAGE and in the LRU list */ { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); @@ -138,11 +141,10 @@ buf_flush_ready_for_flush( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) && (block->io_fix == 0)) { - if (flush_type != BUF_FLUSH_LRU) { return(TRUE); @@ -172,6 +174,8 @@ buf_flush_write_complete( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + block->oldest_modification = ut_dulint_zero; UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); @@ -251,7 +255,7 @@ buf_flush_buffered_writes(void) "InnoDB: to be written to data file. 
We intentionally crash server\n" "InnoDB: to prevent corrupt data from ending up in data\n" "InnoDB: files.\n", - block->offset, block->space); + (ulong) block->offset, (ulong) block->space); ut_error; } @@ -291,6 +295,8 @@ buf_flush_buffered_writes(void) for (i = 0; i < trx_doublewrite->first_free; i++) { block = trx_doublewrite->buf_block_arr[i]; + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, (void*)block->frame, (void*)block); @@ -330,6 +336,8 @@ buf_flush_post_to_doublewrite_buf( try_again: mutex_enter(&(trx_doublewrite->mutex)); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + if (trx_doublewrite->first_free >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { mutex_exit(&(trx_doublewrite->mutex)); @@ -370,16 +378,15 @@ buf_flush_init_for_writing( ulint space, /* in: space id */ ulint page_no) /* in: page number */ { - UT_NOT_USED(space); - /* Write the newest modification lsn to the page header and trailer */ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, newest_lsn); - /* Write the page number */ + /* Write the page number and the space id */ mach_write_to_4(page + FIL_PAGE_OFFSET, page_no); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space); /* Store the new formula checksum */ @@ -405,6 +412,8 @@ buf_flush_write_block_low( /*======================*/ buf_block_t* block) /* in: buffer block to write */ { + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0); #endif @@ -453,12 +462,26 @@ buf_flush_try_page( block = buf_page_hash_get(space, offset); - ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); if (flush_type == BUF_FLUSH_LIST && block && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; + + /* If AWE is enabled and the page is not 
mapped to a frame, + then map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is + in the LRU list and we must put it to + awe_LRU_free_mapped list once mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + block->flush_type = flush_type; if (buf_pool->n_flush[flush_type] == 0) { @@ -491,7 +514,7 @@ buf_flush_try_page( if (buf_debug_prints) { fprintf(stderr, "Flushing page space %lu, page no %lu \n", - block->space, block->offset); + (ulong) block->space, (ulong) block->offset); } buf_flush_write_block_low(block); @@ -510,6 +533,20 @@ buf_flush_try_page( ..._ready_for_flush). */ block->io_fix = BUF_IO_WRITE; + + /* If AWE is enabled and the page is not mapped to a frame, + then map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is + in the LRU list and we must put it to + awe_LRU_free_mapped list once mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + block->flush_type = flush_type; if (buf_pool->n_flush[flush_type] == 0) { @@ -535,6 +572,20 @@ buf_flush_try_page( && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; + + /* If AWE is enabled and the page is not mapped to a frame, + then map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is + in the LRU list and we must put it to + awe_LRU_free_mapped list once mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + block->flush_type = flush_type; if (buf_pool->n_flush[block->flush_type] == 0) { @@ -551,7 +602,8 @@ buf_flush_try_page( if (buf_debug_prints) { fprintf(stderr, "Flushing single page space %lu, page no %lu \n", - block->space, block->offset); + (ulong) block->space, + (ulong) block->offset); } buf_flush_write_block_low(block); @@ -604,6 +656,7 @@ buf_flush_try_neighbors( for (i = low; i < high; i++) { block = buf_page_hash_get(space, 
i); + ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); if (block && flush_type == BUF_FLUSH_LRU && i != offset && !block->old) { @@ -672,10 +725,10 @@ buf_flush_batch( ulint offset; ibool found; - ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); - ut_ad((flush_type != BUF_FLUSH_LIST) || - sync_thread_levels_empty_gen(TRUE)); - + ut_ad((flush_type == BUF_FLUSH_LRU) + || (flush_type == BUF_FLUSH_LIST)); + ut_ad((flush_type != BUF_FLUSH_LIST) + || sync_thread_levels_empty_gen(TRUE)); mutex_enter(&(buf_pool->mutex)); if ((buf_pool->n_flush[flush_type] > 0) @@ -706,7 +759,6 @@ buf_flush_batch( ut_ad(flush_type == BUF_FLUSH_LIST); block = UT_LIST_GET_LAST(buf_pool->flush_list); - if (!block || (ut_dulint_cmp(block->oldest_modification, lsn_limit) >= 0)) { @@ -725,6 +777,7 @@ buf_flush_batch( function a pointer to a block in the list! */ while ((block != NULL) && !found) { + ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (buf_flush_ready_for_flush(block, flush_type)) { @@ -750,7 +803,6 @@ buf_flush_batch( } else if (flush_type == BUF_FLUSH_LRU) { block = UT_LIST_GET_PREV(LRU, block); - } else { ut_ad(flush_type == BUF_FLUSH_LIST); @@ -785,7 +837,7 @@ buf_flush_batch( fprintf(stderr, flush_type == BUF_FLUSH_LRU ? "Flushed %lu pages in LRU flush\n" : "Flushed %lu pages in flush list flush\n", - page_count); + (ulong) page_count); } return(page_count); diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index 0ced7e23abe..796311f0157 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -62,6 +62,91 @@ buf_LRU_block_free_hashed_page( be in a state where it can be freed */ /********************************************************************** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. 
*/ + +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id) /* in: space id */ +{ + buf_block_t* block; + ulint page_no; + ibool all_freed; + +scan_again: + mutex_enter(&(buf_pool->mutex)); + + all_freed = TRUE; + + block = UT_LIST_GET_LAST(buf_pool->LRU); + + while (block != NULL) { + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + + if (block->space == id + && (block->buf_fix_count > 0 || block->io_fix != 0)) { + + /* We cannot remove this page during this scan yet; + maybe the system is currently reading it in, or + flushing the modifications to the file */ + + all_freed = FALSE; + + goto next_page; + } + + if (block->space == id) { +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + printf( + "Dropping space %lu page %lu\n", + (ulong) block->space, + (ulong) block->offset); + } +#endif + if (block->is_hashed) { + page_no = block->offset; + + mutex_exit(&(buf_pool->mutex)); + + /* Note that the following call will acquire + an S-latch on the page */ + + btr_search_drop_page_hash_when_freed(id, + page_no); + goto scan_again; + } + + if (0 != ut_dulint_cmp(block->oldest_modification, + ut_dulint_zero)) { + + /* Remove from the flush list of modified + blocks */ + block->oldest_modification = ut_dulint_zero; + + UT_LIST_REMOVE(flush_list, + buf_pool->flush_list, block); + } + + /* Remove from the LRU list */ + buf_LRU_block_remove_hashed_page(block); + buf_LRU_block_free_hashed_page(block); + } +next_page: + block = UT_LIST_GET_PREV(LRU, block); + } + + mutex_exit(&(buf_pool->mutex)); + + if (!all_freed) { + os_thread_sleep(20000); + + goto scan_again; + } +} + +/********************************************************************** Gets the minimum LRU_position field for the blocks in an initial segment (determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not guaranteed to be precise, because the ulint_clock may wrap around. 
*/ @@ -118,43 +203,45 @@ buf_LRU_search_and_free_block( mutex_enter(&(buf_pool->mutex)); freed = FALSE; - block = UT_LIST_GET_LAST(buf_pool->LRU); while (block != NULL) { - + ut_a(block->in_LRU_list); if (buf_flush_ready_for_replace(block)) { if (buf_debug_prints) { fprintf(stderr, "Putting space %lu page %lu to free list\n", - block->space, block->offset); + (ulong) block->space, + (ulong) block->offset); } - + buf_LRU_block_remove_hashed_page(block); mutex_exit(&(buf_pool->mutex)); - btr_search_drop_page_hash_index(block->frame); - + /* Remove possible adaptive hash index built on the + page; in the case of AWE the block may not have a + frame at all */ + + if (block->frame) { + btr_search_drop_page_hash_index(block->frame); + } mutex_enter(&(buf_pool->mutex)); ut_a(block->buf_fix_count == 0); buf_LRU_block_free_hashed_page(block); - freed = TRUE; break; } - block = UT_LIST_GET_PREV(LRU, block); distance++; if (!freed && n_iterations <= 10 && distance > 100 + (n_iterations * buf_pool->curr_size) / 10) { - buf_pool->LRU_flush_ended = 0; mutex_exit(&(buf_pool->mutex)); @@ -162,15 +249,12 @@ buf_LRU_search_and_free_block( return(FALSE); } } - if (buf_pool->LRU_flush_ended > 0) { buf_pool->LRU_flush_ended--; } - - if (!freed) { + if (!freed) { buf_pool->LRU_flush_ended = 0; } - mutex_exit(&(buf_pool->mutex)); return(freed); @@ -211,7 +295,9 @@ list. 
*/ buf_block_t* buf_LRU_get_free_block(void) /*========================*/ - /* out: the free control block */ + /* out: the free control block; also if AWE is + used, it is guaranteed that the block has its + page mapped to a frame when we return */ { buf_block_t* block = NULL; ibool freed; @@ -254,7 +340,7 @@ loop: "InnoDB: the buffer pool bigger?\n" "InnoDB: Starting the InnoDB Monitor to print diagnostics, including\n" "InnoDB: lock heap and hash index sizes.\n", - (ulong)(buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); + (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); @@ -273,7 +359,27 @@ loop: if (UT_LIST_GET_LEN(buf_pool->free) > 0) { block = UT_LIST_GET_FIRST(buf_pool->free); + ut_a(block->in_free_list); UT_LIST_REMOVE(free, buf_pool->free, block); + block->in_free_list = FALSE; + ut_a(block->state != BUF_BLOCK_FILE_PAGE); + ut_a(!block->in_LRU_list); + + if (srv_use_awe) { + if (block->frame) { + /* Remove from the list of mapped pages */ + + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } else { + /* We map the page to a frame; second param + FALSE below because we do not want it to be + added to the awe_LRU_free_mapped list */ + + buf_awe_map_page_to_frame(block, FALSE); + } + } + block->state = BUF_BLOCK_READY_FOR_USE; mutex_exit(&(buf_pool->mutex)); @@ -311,10 +417,11 @@ loop: "InnoDB: %lu OS file reads, %lu OS file writes, %lu OS fsyncs\n" "InnoDB: Starting InnoDB Monitor to print further\n" "InnoDB: diagnostics to the standard output.\n", - n_iterations, - fil_n_pending_log_flushes, - fil_n_pending_tablespace_flushes, - os_n_file_reads, os_n_file_writes, os_n_fsyncs); + (ulong) n_iterations, + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); mon_value_was = srv_print_innodb_monitor; started_monitor = TRUE; @@ 
-363,7 +470,7 @@ buf_LRU_old_adjust_len(void) ulint old_len; ulint new_len; - ut_ad(buf_pool->LRU_old); + ut_a(buf_pool->LRU_old); #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ @@ -373,6 +480,8 @@ buf_LRU_old_adjust_len(void) old_len = buf_pool->LRU_old_len; new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + ut_a(buf_pool->LRU_old->in_LRU_list); + /* Update the LRU_old pointer if necessary */ if (old_len < new_len - BUF_LRU_OLD_TOLERANCE) { @@ -389,7 +498,7 @@ buf_LRU_old_adjust_len(void) buf_pool->LRU_old); buf_pool->LRU_old_len--; } else { - ut_ad(buf_pool->LRU_old); /* Check that we did not + ut_a(buf_pool->LRU_old); /* Check that we did not fall out of the LRU list */ return; } @@ -397,9 +506,8 @@ buf_LRU_old_adjust_len(void) } /*********************************************************************** -Initializes the old blocks pointer in the LRU list. -This function should be called when the LRU list grows to -BUF_LRU_OLD_MIN_LEN length. */ +Initializes the old blocks pointer in the LRU list. This function should be +called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. 
*/ static void buf_LRU_old_init(void) @@ -407,7 +515,7 @@ buf_LRU_old_init(void) { buf_block_t* block; - ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); /* We first initialize all blocks in the LRU list as old and then use the adjust function to move the LRU_old pointer to the right @@ -416,6 +524,8 @@ buf_LRU_old_init(void) block = UT_LIST_GET_FIRST(buf_pool->LRU); while (block != NULL) { + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->in_LRU_list); block->old = TRUE; block = UT_LIST_GET_NEXT(LRU, block); } @@ -440,6 +550,9 @@ buf_LRU_remove_block( ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->in_LRU_list); + /* If the LRU_old pointer is defined and points to just this block, move it backward one step */ @@ -453,11 +566,19 @@ buf_LRU_remove_block( (buf_pool->LRU_old)->old = TRUE; buf_pool->LRU_old_len++; - ut_ad(buf_pool->LRU_old); + ut_a(buf_pool->LRU_old); } /* Remove the block from the LRU list */ UT_LIST_REMOVE(LRU, buf_pool->LRU, block); + block->in_LRU_list = FALSE; + + if (srv_use_awe && block->frame) { + /* Remove from the list of mapped pages */ + + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } /* If the LRU list is so short that LRU_old not defined, return */ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { @@ -495,6 +616,8 @@ buf_LRU_add_block_to_end_low( ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + block->old = TRUE; last_block = UT_LIST_GET_LAST(buf_pool->LRU); @@ -505,8 +628,17 @@ buf_LRU_add_block_to_end_low( block->LRU_position = buf_pool_clock_tic(); } + ut_a(!block->in_LRU_list); UT_LIST_ADD_LAST(LRU, buf_pool->LRU, block); + block->in_LRU_list = TRUE; + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages */ + + UT_LIST_ADD_LAST(awe_LRU_free_mapped, 
+ buf_pool->awe_LRU_free_mapped, block); + } + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { buf_pool->LRU_old_len++; @@ -549,9 +681,21 @@ buf_LRU_add_block_low( ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(!block->in_LRU_list); + block->old = old; cl = buf_pool_clock_tic(); + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages; for simplicity we always + add to the start, even if the user would have set 'old' + TRUE */ + + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block); @@ -569,6 +713,8 @@ buf_LRU_add_block_low( block->LRU_position = (buf_pool->LRU_old)->LRU_position; } + block->in_LRU_list = TRUE; + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { ut_ad(buf_pool->LRU_old); @@ -639,9 +785,12 @@ buf_LRU_block_free_non_file_page( #endif /* UNIV_SYNC_DEBUG */ ut_ad(block); - ut_ad((block->state == BUF_BLOCK_MEMORY) + ut_a((block->state == BUF_BLOCK_MEMORY) || (block->state == BUF_BLOCK_READY_FOR_USE)); + ut_a(block->n_pointers == 0); + ut_a(!block->in_free_list); + block->state = BUF_BLOCK_NOT_USED; #ifdef UNIV_DEBUG @@ -649,6 +798,14 @@ buf_LRU_block_free_non_file_page( memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif UT_LIST_ADD_FIRST(free, buf_pool->free, block); + block->in_free_list = TRUE; + + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages */ + + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } } /********************************************************************** @@ -667,8 +824,7 @@ buf_LRU_block_remove_hashed_page( #endif /* UNIV_SYNC_DEBUG */ ut_ad(block); - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); - + ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_a(block->io_fix == 0); ut_a(block->buf_fix_count == 0); 
ut_a(ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) == 0); @@ -677,7 +833,32 @@ buf_LRU_block_remove_hashed_page( buf_pool->freed_page_clock += 1; - buf_frame_modify_clock_inc(block->frame); + /* Note that if AWE is enabled the block may not have a frame at all */ + + buf_block_modify_clock_inc(block); + + if (block != buf_page_hash_get(block->space, block->offset)) { + fprintf(stderr, +"InnoDB: Error: page %lu %lu not found from the hash table\n", + (ulong) block->space, + (ulong) block->offset); + if (buf_page_hash_get(block->space, block->offset)) { + fprintf(stderr, +"InnoDB: From hash table we find block %lx of %lu %lu which is not %lx\n", + (ulong) buf_page_hash_get(block->space, block->offset), + (ulong) buf_page_hash_get(block->space, block->offset)->space, + (ulong) buf_page_hash_get(block->space, block->offset)->offset, + (ulong) block); + } + +#ifdef UNIV_DEBUG + buf_print(); + buf_LRU_print(); + buf_validate(); + buf_LRU_validate(); +#endif + ut_a(0); + } HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, buf_page_address_fold(block->space, block->offset), @@ -698,7 +879,7 @@ buf_LRU_block_free_hashed_page( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(block->state == BUF_BLOCK_REMOVE_HASH); + ut_a(block->state == BUF_BLOCK_REMOVE_HASH); block->state = BUF_BLOCK_MEMORY; @@ -791,7 +972,7 @@ buf_LRU_print(void) ut_ad(buf_pool); mutex_enter(&(buf_pool->mutex)); - fprintf(stderr, "Pool ulint clock %lu\n", buf_pool->ulint_clock); + fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock); block = UT_LIST_GET_FIRST(buf_pool->LRU); @@ -799,7 +980,7 @@ buf_LRU_print(void) while (block != NULL) { - fprintf(stderr, "BLOCK %lu ", block->offset); + fprintf(stderr, "BLOCK %lu ", (ulong) block->offset); if (block->old) { fputs("old ", stderr); @@ -807,11 +988,11 @@ buf_LRU_print(void) if (block->buf_fix_count) { fprintf(stderr, "buffix count %lu ", - block->buf_fix_count); + (ulong) 
block->buf_fix_count); } if (block->io_fix) { - fprintf(stderr, "io_fix %lu ", block->io_fix); + fprintf(stderr, "io_fix %lu ", (ulong) block->io_fix); } if (ut_dulint_cmp(block->oldest_modification, @@ -822,9 +1003,9 @@ buf_LRU_print(void) frame = buf_block_get_frame(block); fprintf(stderr, "LRU pos %lu type %lu index id %lu ", - block->LRU_position, - fil_page_get_type(frame), - ut_dulint_get_low(btr_page_get_index_id(frame))); + (ulong) block->LRU_position, + (ulong) fil_page_get_type(frame), + (ulong) ut_dulint_get_low(btr_page_get_index_id(frame))); block = UT_LIST_GET_NEXT(LRU, block); if (++len == 10) { diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 83397c9c7fa..71e885ff439 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -49,19 +49,30 @@ ulint buf_read_page_low( /*==============*/ /* out: 1 if a read request was queued, 0 if the page - already resided in buf_pool or if the page is in + already resided in buf_pool, or if the page is in the doublewrite buffer blocks in which case it is never - read into the pool */ + read into the pool, or if the tablespace does not + exist or is being dropped */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are + trying to read from a non-existent tablespace, or a + tablespace which is just now being dropped */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ..., ORed to OS_AIO_SIMULATED_WAKE_LATER (see below at read-ahead functions) */ ulint space, /* in: space id */ + ib_longlong tablespace_version, /* in: if the space memory object has + this timestamp different from what we are giving here, + treat the tablespace as dropped; this is a timestamp we + use to stop dangling page reads from a tablespace + which we have DISCARDed + IMPORTed back */ ulint offset) /* in: page number */ { buf_block_t* block; ulint wake_later; + *err = DB_SUCCESS; + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & 
~OS_AIO_SIMULATED_WAKE_LATER; @@ -72,6 +83,11 @@ buf_read_page_low( || (offset >= trx_doublewrite->block2 && offset < trx_doublewrite->block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to read doublewrite buffer page %lu\n", + (ulong) offset); + return(0); } @@ -98,28 +114,41 @@ buf_read_page_low( sync = TRUE; } - block = buf_page_init_for_read(mode, space, offset); - - if (block != NULL) { - if (buf_debug_prints) { - fprintf(stderr, - "Posting read request for page %lu, sync %lu\n", - offset, sync); - } - - fil_io(OS_FILE_READ | wake_later, - sync, space, offset, 0, UNIV_PAGE_SIZE, - (void*)block->frame, (void*)block); - if (sync) { - /* The i/o is already completed when we arrive from - fil_read */ - buf_page_io_complete(block); - } + /* The following call will also check if the tablespace does not exist + or is being dropped; if we succeed in initing the page in the buffer + pool for read, then DISCARD cannot proceed until the read has + completed */ + block = buf_page_init_for_read(err, mode, space, tablespace_version, + offset); + if (block == NULL) { - return(1); + return(0); + } + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Posting read request for page %lu, sync %lu\n", + (ulong) offset, + (ulong) sync); } +#endif + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); - return(0); + *err = fil_io(OS_FILE_READ | wake_later, + sync, space, + offset, 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); + ut_a(*err == DB_SUCCESS); + + if (sync) { + /* The i/o is already completed when we arrive from + fil_read */ + buf_page_io_complete(block); + } + + return(1); } /************************************************************************ @@ -144,12 +173,14 @@ buf_read_ahead_random( ulint offset) /* in: page number of a page which the current thread wants to access */ { + ib_longlong tablespace_version; buf_block_t* block; ulint recent_blocks = 0; ulint count; ulint 
LRU_recent_limit; ulint ibuf_mode; ulint low, high; + ulint err; ulint i; if (srv_startup_is_before_trx_rollback_phase) { @@ -166,11 +197,16 @@ buf_read_ahead_random( return(0); } + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! */ + + tablespace_version = fil_space_get_version(space); + low = (offset / BUF_READ_AHEAD_RANDOM_AREA) * BUF_READ_AHEAD_RANDOM_AREA; high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1) * BUF_READ_AHEAD_RANDOM_AREA; - if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); @@ -195,7 +231,6 @@ buf_read_ahead_random( that is, reside near the start of the LRU list. */ for (i = low; i < high; i++) { - block = buf_page_hash_get(space, i); if ((block) @@ -229,10 +264,17 @@ buf_read_ahead_random( mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(i)) { - - count += buf_read_page_low(FALSE, ibuf_mode + count += buf_read_page_low(&err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, - space, i); + space, tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: in random readahead trying to access tablespace\n" +"InnoDB: %lu page no. 
%lu,\n" +"InnoDB: but the tablespace does not exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } } } @@ -245,7 +287,8 @@ buf_read_ahead_random( if (buf_debug_prints && (count > 0)) { fprintf(stderr, "Random read-ahead space %lu offset %lu pages %lu\n", - space, offset, count); + (ulong) space, (ulong) offset, + (ulong) count); } return(count); @@ -266,15 +309,27 @@ buf_read_page( ulint space, /* in: space id */ ulint offset) /* in: page number */ { - ulint count; - ulint count2; + ib_longlong tablespace_version; + ulint count; + ulint count2; + ulint err; + + tablespace_version = fil_space_get_version(space); count = buf_read_ahead_random(space, offset); /* We do the i/o in the synchronous aio mode to save thread switches: hence TRUE */ - count2 = buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, offset); + count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, + tablespace_version, offset); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: error: trying to access tablespace %lu page no. %lu,\n" +"InnoDB: but the tablespace does not exist or is just being dropped.\n", + (ulong) space, (ulong) offset); + } /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); @@ -314,6 +369,7 @@ buf_read_ahead_linear( ulint offset) /* in: page number of a page; NOTE: the current thread must want access to this page (see NOTE 3 above) */ { + ib_longlong tablespace_version; buf_block_t* block; buf_frame_t* frame; buf_block_t* pred_block = NULL; @@ -325,6 +381,7 @@ buf_read_ahead_linear( ulint fail_count; ulint ibuf_mode; ulint low, high; + ulint err; ulint i; if (srv_startup_is_before_trx_rollback_phase) { @@ -352,14 +409,21 @@ buf_read_ahead_linear( return(0); } + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! 
*/ + + tablespace_version = fil_space_get_version(space); + + mutex_enter(&(buf_pool->mutex)); + if (high > fil_space_get_size(space)) { + mutex_exit(&(buf_pool->mutex)); /* The area is not whole, return */ return(0); } - mutex_enter(&(buf_pool->mutex)); - if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { mutex_exit(&(buf_pool->mutex)); @@ -380,18 +444,15 @@ buf_read_ahead_linear( fail_count = 0; for (i = low; i < high; i++) { - block = buf_page_hash_get(space, i); if ((block == NULL) || !block->accessed) { - /* Not accessed */ fail_count++; } else if (pred_block && (ut_ulint_cmp(block->LRU_position, pred_block->LRU_position) != asc_or_desc)) { - /* Accesses not in the right order */ fail_count++; @@ -464,7 +525,7 @@ buf_read_ahead_linear( return(0); } - /* If we got this far, read-ahead can be sensible: do it */ + /* If we got this far, read-ahead can be sensible: do it */ if (ibuf_inside()) { ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; @@ -485,9 +546,17 @@ buf_read_ahead_linear( aio mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(i)) { - count += buf_read_page_low(FALSE, ibuf_mode + count += buf_read_page_low(&err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, - space, i); + space, tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: in linear readahead trying to access tablespace\n" +"InnoDB: %lu page no. 
%lu,\n" +"InnoDB: but the tablespace does not exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } } } @@ -503,7 +572,7 @@ buf_read_ahead_linear( if (buf_debug_prints && (count > 0)) { fprintf(stderr, "LINEAR read-ahead space %lu offset %lu pages %lu\n", - space, offset, count); + (ulong) space, (ulong) offset, (ulong) count); } return(count); @@ -511,7 +580,7 @@ buf_read_ahead_linear( /************************************************************************ Issues read requests for pages which the ibuf module wants to read in, in -order to contract insert buffer trees. Technically, this function is like +order to contract the insert buffer tree. Technically, this function is like a read-ahead function. */ void @@ -520,11 +589,17 @@ buf_read_ibuf_merge_pages( ibool sync, /* in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ - ulint space, /* in: space id */ + ulint* space_ids, /* in: array of space ids */ + ib_longlong* space_versions,/* in: the spaces must have this version + number (timestamp), otherwise we discard the + read; we use this to cancel reads if + DISCARD + IMPORT may have changed the + tablespace size */ ulint* page_nos, /* in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /* in: number of page numbers in the array */ { + ulint err; ulint i; ut_ad(!ibuf_inside()); @@ -538,11 +613,19 @@ buf_read_ibuf_merge_pages( for (i = 0; i < n_stored; i++) { if ((i + 1 == n_stored) && sync) { - buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, - page_nos[i]); + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, + space_ids[i], space_versions[i], page_nos[i]); } else { - buf_read_page_low(FALSE, BUF_READ_ANY_PAGE, space, - page_nos[i]); + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE, + space_ids[i], space_versions[i], page_nos[i]); + } + + if (err == DB_TABLESPACE_DELETED) { + /* We have deleted or are deleting the 
single-table + tablespace: remove the entries for that page */ + + ibuf_merge_or_delete_for_page(NULL, space_ids[i], + page_nos[i], FALSE); } } @@ -552,7 +635,7 @@ buf_read_ibuf_merge_pages( if (buf_debug_prints) { fprintf(stderr, "Ibuf merge read-ahead space %lu pages %lu\n", - space, n_stored); + (ulong) space_ids[0], (ulong) n_stored); } } @@ -570,8 +653,12 @@ buf_read_recv_pages( highest page number the last in the array */ ulint n_stored) /* in: number of page numbers in the array */ { - ulint count; - ulint i; + ib_longlong tablespace_version; + ulint count; + ulint err; + ulint i; + + tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { @@ -579,7 +666,7 @@ buf_read_recv_pages( os_aio_print_debug = FALSE; - while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) { + while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(500000); @@ -590,7 +677,7 @@ buf_read_recv_pages( fprintf(stderr, "InnoDB: Error: InnoDB has waited for 50 seconds for pending\n" "InnoDB: reads to the buffer pool to be finished.\n" -"InnoDB: Number of pending reads %lu\n", buf_pool->n_pend_reads); +"InnoDB: Number of pending reads %lu\n", (ulong) buf_pool->n_pend_reads); os_aio_print_debug = TRUE; } @@ -599,12 +686,12 @@ buf_read_recv_pages( os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { - buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, - page_nos[i]); + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, + tablespace_version, page_nos[i]); } else { - buf_read_page_low(FALSE, BUF_READ_ANY_PAGE + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, - space, page_nos[i]); + space, tablespace_version, page_nos[i]); } } @@ -615,6 +702,6 @@ buf_read_recv_pages( if (buf_debug_prints) { fprintf(stderr, - "Recovery applies read-ahead pages %lu\n", n_stored); + "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } } diff --git 
a/innobase/configure.in b/innobase/configure.in index 7b123d96cc2..652291f1f38 100644 --- a/innobase/configure.in +++ b/innobase/configure.in @@ -38,6 +38,7 @@ AC_PROG_LIBTOOL AC_CHECK_HEADERS(aio.h sched.h) AC_CHECK_SIZEOF(int, 4) AC_CHECK_SIZEOF(long, 4) +AC_CHECK_SIZEOF(void*, 4) AC_CHECK_FUNCS(sched_yield) AC_CHECK_FUNCS(fdatasync) #AC_CHECK_FUNCS(localtime_r) # Already checked by MySQL diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c index 11eb295f033..97ec1a1acd9 100644 --- a/innobase/data/data0data.c +++ b/innobase/data/data0data.c @@ -193,7 +193,8 @@ dfield_check_typed_no_assert( fprintf(stderr, "InnoDB: Error: data field type %lu, len %lu\n", - dfield_get_type(field)->mtype, dfield_get_len(field)); + (ulong) dfield_get_type(field)->mtype, + (ulong) dfield_get_len(field)); return(FALSE); } @@ -215,7 +216,7 @@ dtuple_check_typed_no_assert( if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) { fprintf(stderr, "InnoDB: Error: index entry has %lu fields\n", - dtuple_get_n_fields(tuple)); + (ulong) dtuple_get_n_fields(tuple)); dump: fputs("InnoDB: Tuple contents: ", stderr); dtuple_print(stderr, tuple); @@ -250,7 +251,8 @@ dfield_check_typed( fprintf(stderr, "InnoDB: Error: data field type %lu, len %lu\n", - dfield_get_type(field)->mtype, dfield_get_len(field)); + (ulong) dfield_get_type(field)->mtype, + (ulong) dfield_get_len(field)); ut_error; } @@ -444,10 +446,10 @@ dtuple_print( n_fields = dtuple_get_n_fields(tuple); - fprintf(f, "DATA TUPLE: %lu fields;\n", n_fields); + fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields); for (i = 0; i < n_fields; i++) { - fprintf(f, " %lu:", i); + fprintf(f, " %lu:", (ulong) i); field = dtuple_get_nth_field(tuple, i); @@ -502,7 +504,7 @@ dtuple_convert_big_rec( if (size > 1000000000) { fprintf(stderr, -"InnoDB: Warning: tuple size very big: %lu\n", size); +"InnoDB: Warning: tuple size very big: %lu\n", (ulong) size); fputs("InnoDB: Tuple contents: ", stderr); dtuple_print(stderr, entry); putc('\n', 
stderr); diff --git a/innobase/data/data0type.c b/innobase/data/data0type.c index 077012553ba..97d93b1b0ec 100644 --- a/innobase/data/data0type.c +++ b/innobase/data/data0type.c @@ -12,10 +12,99 @@ Created 1/16/1996 Heikki Tuuri #include "data0type.ic" #endif +/* At the database startup we store the default-charset collation number of +this MySQL installation to this global variable. If we have < 4.1.2 format +column definitions, or records in the insert buffer, we use this +charset-collation code for them. */ + +ulint data_mysql_default_charset_coll = 99999999; +ulint data_mysql_latin1_swedish_charset_coll = 99999999; + dtype_t dtype_binary_val = {DATA_BINARY, 0, 0, 0}; dtype_t* dtype_binary = &dtype_binary_val; /************************************************************************* +Checks if a data main type is a string type. Also a BLOB is considered a +string type. */ + +ibool +dtype_is_string_type( +/*=================*/ + /* out: TRUE if string type */ + ulint mtype) /* in: InnoDB main data type code: DATA_CHAR, ... */ +{ + if (mtype <= DATA_BLOB + || mtype == DATA_MYSQL + || mtype == DATA_VARMYSQL) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Checks if a type is a binary string type. Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. */ + +ibool +dtype_is_binary_string_type( +/*========================*/ + /* out: TRUE if binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype) /* in: precise type */ +{ + if ((mtype == DATA_FIXBINARY) + || (mtype == DATA_BINARY) + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Checks if a type is a non-binary string type. 
That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. */ + +ibool +dtype_is_non_binary_string_type( +/*============================*/ + /* out: TRUE if non-binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype) /* in: precise type */ +{ + if (dtype_is_string_type(mtype) == TRUE + && dtype_is_binary_string_type(mtype, prtype) == FALSE) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. */ + +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /* in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll) /* in: MySQL charset-collation code */ +{ + ut_a(old_prtype < 256 * 256); + ut_a(charset_coll < 256); + + return(old_prtype + (charset_coll << 16)); +} + +/************************************************************************* Validates a data type structure. 
*/ ibool @@ -63,7 +152,7 @@ dtype_print( } else if (mtype == DATA_SYS) { fputs("DATA_SYS", stderr); } else { - fprintf(stderr, "type %lu", mtype); + fprintf(stderr, "type %lu", (ulong) mtype); } len = type->len; @@ -71,7 +160,7 @@ dtype_print( if ((type->mtype == DATA_SYS) || (type->mtype == DATA_VARCHAR) || (type->mtype == DATA_CHAR)) { - putc(' ', stderr); + putc(' ', stderr); if (prtype == DATA_ROW_ID) { fputs("DATA_ROW_ID", stderr); len = DATA_ROW_ID_LEN; @@ -86,9 +175,9 @@ dtype_print( } else if (prtype == DATA_ENGLISH) { fputs("DATA_ENGLISH", stderr); } else { - fprintf(stderr, "prtype %lu", mtype); + fprintf(stderr, "prtype %lu", (ulong) mtype); } } - fprintf(stderr, " len %lu prec %lu", len, type->prec); + fprintf(stderr, " len %lu prec %lu", (ulong) len, (ulong) type->prec); } diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index 1cae2750fbe..f156cf67a18 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -254,29 +254,26 @@ dict_boot(void) /* Insert into the dictionary cache the descriptions of the basic system tables */ /*-------------------------*/ - table = dict_mem_table_create((char *) "SYS_TABLES", DICT_HDR_SPACE,8); - - dict_mem_table_add_col(table, (char *) "NAME", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table, (char *) "ID", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table, (char *) "N_COLS", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "TYPE", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "MIX_ID", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table, (char *) "MIX_LEN", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "CLUSTER_NAME", DATA_BINARY, - 0, 0, 0); - dict_mem_table_add_col(table, (char *) "SPACE", DATA_INT, 0, 4, 0); + table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE,8); + + dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, 
"N_COLS", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "TYPE", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "MIX_ID", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "MIX_LEN", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "CLUSTER_NAME", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "SPACE", DATA_INT, 0, 4, 0); table->id = DICT_TABLES_ID; dict_table_add_to_cache(table); dict_sys->sys_tables = table; - index = dict_mem_index_create((char *) "SYS_TABLES", (char *) - "CLUST_IND", - DICT_HDR_SPACE, - DICT_UNIQUE | DICT_CLUSTERED, 1); + index = dict_mem_index_create("SYS_TABLES", "CLUST_IND", + DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 1); - dict_mem_index_add_field(index, (char *) "NAME", 0, 0); + dict_mem_index_add_field(index, "NAME", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLES, MLOG_4BYTES, &mtr); @@ -284,89 +281,89 @@ dict_boot(void) ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - index = dict_mem_index_create((char *) "SYS_TABLES", - (char *) "ID_IND", DICT_HDR_SPACE, - DICT_UNIQUE, 1); - dict_mem_index_add_field(index, (char *) "ID", 0, 0); + index = dict_mem_index_create("SYS_TABLES", "ID_IND", + DICT_HDR_SPACE, DICT_UNIQUE, 1); + dict_mem_index_add_field(index, "ID", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLE_IDS, MLOG_4BYTES, &mtr); index->id = DICT_TABLE_IDS_ID; ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - table = dict_mem_table_create((char *) "SYS_COLUMNS",DICT_HDR_SPACE,7); - - dict_mem_table_add_col(table, (char *) "TABLE_ID", DATA_BINARY,0,0,0); - dict_mem_table_add_col(table, (char *) "POS", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "NAME", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table, (char *) "MTYPE", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "PRTYPE", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "LEN", DATA_INT, 0, 4, 0); - 
dict_mem_table_add_col(table, (char *) "PREC", DATA_INT, 0, 4, 0); + table = dict_mem_table_create("SYS_COLUMNS",DICT_HDR_SPACE,7); + + dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY,0,0,0); + dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "MTYPE", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "PRTYPE", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "LEN", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "PREC", DATA_INT, 0, 4, 0); table->id = DICT_COLUMNS_ID; dict_table_add_to_cache(table); dict_sys->sys_columns = table; - index = dict_mem_index_create((char *) "SYS_COLUMNS", - (char *) "CLUST_IND", DICT_HDR_SPACE, - DICT_UNIQUE | DICT_CLUSTERED, 2); + index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND", + DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); - dict_mem_index_add_field(index, (char *) "TABLE_ID", 0, 0); - dict_mem_index_add_field(index, (char *) "POS", 0, 0); + dict_mem_index_add_field(index, "TABLE_ID", 0, 0); + dict_mem_index_add_field(index, "POS", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_COLUMNS, MLOG_4BYTES, &mtr); index->id = DICT_COLUMNS_ID; ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - table = dict_mem_table_create((char *) "SYS_INDEXES",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_INDEXES",DICT_HDR_SPACE,7); - dict_mem_table_add_col(table, (char *) "TABLE_ID", DATA_BINARY, 0,0,0); - dict_mem_table_add_col(table, (char *) "ID", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table, (char *) "NAME", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table, (char *) "N_FIELDS", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "TYPE", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "SPACE", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "PAGE_NO", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY, 
0,0,0); + dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "N_FIELDS", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "TYPE", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "SPACE", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, "PAGE_NO", DATA_INT, 0, 4, 0); /* The '+ 2' below comes from the 2 system fields */ - ut_ad(DICT_SYS_INDEXES_PAGE_NO_FIELD == 6 + 2); - ut_ad(DICT_SYS_INDEXES_SPACE_NO_FIELD == 5 + 2); +#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2 +#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2" +#endif +#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2 +#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2" +#endif table->id = DICT_INDEXES_ID; dict_table_add_to_cache(table); dict_sys->sys_indexes = table; - index = dict_mem_index_create((char *) "SYS_INDEXES", - (char *) "CLUST_IND", DICT_HDR_SPACE, - DICT_UNIQUE | DICT_CLUSTERED, 2); + index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND", + DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); - dict_mem_index_add_field(index, (char *) "TABLE_ID", 0, 0); - dict_mem_index_add_field(index, (char *) "ID", 0, 0); + dict_mem_index_add_field(index, "TABLE_ID", 0, 0); + dict_mem_index_add_field(index, "ID", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_INDEXES, MLOG_4BYTES, &mtr); index->id = DICT_INDEXES_ID; ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - table = dict_mem_table_create((char *) "SYS_FIELDS", DICT_HDR_SPACE,3); + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE,3); - dict_mem_table_add_col(table, (char *) "INDEX_ID", DATA_BINARY, 0,0,0); - dict_mem_table_add_col(table, (char *) "POS", DATA_INT, 0, 4, 0); - dict_mem_table_add_col(table, (char *) "COL_NAME", DATA_BINARY, 0,0,0); + dict_mem_table_add_col(table, "INDEX_ID", DATA_BINARY, 0,0,0); + dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); + dict_mem_table_add_col(table, 
"COL_NAME", DATA_BINARY, 0,0,0); table->id = DICT_FIELDS_ID; dict_table_add_to_cache(table); dict_sys->sys_fields = table; - index = dict_mem_index_create((char *) "SYS_FIELDS", - (char *) "CLUST_IND", DICT_HDR_SPACE, - DICT_UNIQUE | DICT_CLUSTERED, 2); + index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND", + DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); - dict_mem_index_add_field(index, (char *) "INDEX_ID", 0, 0); - dict_mem_index_add_field(index, (char *) "POS", 0, 0); + dict_mem_index_add_field(index, "INDEX_ID", 0, 0); + dict_mem_index_add_field(index, "POS", 0, 0); index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_FIELDS, MLOG_4BYTES, &mtr); @@ -419,6 +416,4 @@ dict_create(void) dict_boot(); dict_insert_initial_data(); - - sync_order_checks_on = TRUE; } diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index 6ebefc98a24..fd8e02585ae 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -203,6 +203,8 @@ dict_build_table_def_step( dict_table_t* table; dict_table_t* cluster_table; dtuple_t* row; + ulint error; + mtr_t mtr; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(dict_sys->mutex))); @@ -231,6 +233,32 @@ dict_build_table_def_step( table->mix_id = dict_hdr_get_new_id(DICT_HDR_MIX_ID); } + if (srv_file_per_table) { + /* We create a new single-table tablespace for the table. + We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of the + table we create here. 
*/ + + table->space = 0; /* reset to zero for the call below */ + + error = fil_create_new_single_table_tablespace( + &(table->space), table->name, + FIL_IBD_FILE_INITIAL_SIZE); + if (error != DB_SUCCESS) { + + return(error); + } + + mtr_start(&mtr); + + fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr); + + mtr_commit(&mtr); + } + row = dict_create_sys_tables_tuple(table, node->heap); ins_node_set_new_row(node->tab_def, row); @@ -424,8 +452,8 @@ dict_create_sys_fields_tuple( } /********************************************************************* -Creates the tuple with which the index entry is searched for -writing the index tree root page number, if such a tree is created. */ +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. */ static dtuple_t* dict_create_search_tuple( @@ -494,10 +522,10 @@ dict_build_index_def_step( index->id = dict_hdr_get_new_id(DICT_HDR_INDEX_ID); - if (index->type & DICT_CLUSTERED) { - /* Inherit the space from the table */ - index->space = table->space; - } + /* Inherit the space id from the table; we store all indexes of a + table in the same tablespace */ + + index->space = table->space; index->page_no = FIL_NULL; @@ -580,6 +608,9 @@ dict_create_index_tree_step( index->page_no = btr_create(index->type, index->space, index->id, &mtr); + /* printf("Created a new index tree in space %lu root page %lu\n", + index->space, index->page_no); */ + page_rec_write_index_page_no(btr_pcur_get_rec(&pcur), DICT_SYS_INDEXES_PAGE_NO_FIELD, index->page_no, &mtr); @@ -630,7 +661,14 @@ dict_drop_index_tree( ut_ad(len == 4); space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); - + + if (!fil_tablespace_exists_in_mem(space)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + return; + } + /* We free all the pages but the root page first; this operation may span several mini-transactions */ @@ -640,6 +678,8 @@ dict_drop_index_tree( we 
write FIL_NULL to the appropriate field in the SYS_INDEXES record: this mini-transaction marks the B-tree totally freed */ + /* printf("Dropping index tree in space %lu root page %lu\n", space, + root_page_no); */ btr_free_root(space, root_page_no, mtr); page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, @@ -964,12 +1004,12 @@ dict_create_or_check_foreign_constraint_tables(void) que_t* graph; ulint error; trx_t* trx; - char* str; + const char* str; mutex_enter(&(dict_sys->mutex)); - table1 = dict_table_get_low((char *) "SYS_FOREIGN"); - table2 = dict_table_get_low((char *) "SYS_FOREIGN_COLS"); + table1 = dict_table_get_low("SYS_FOREIGN"); + table2 = dict_table_get_low("SYS_FOREIGN_COLS"); if (table1 && table2 && UT_LIST_GET_LEN(table1->indexes) == 3 @@ -987,20 +1027,20 @@ dict_create_or_check_foreign_constraint_tables(void) trx = trx_allocate_for_mysql(); - trx->op_info = (char *) "creating foreign key sys tables"; + trx->op_info = "creating foreign key sys tables"; row_mysql_lock_data_dictionary(trx); if (table1) { fprintf(stderr, "InnoDB: dropping incompletely created SYS_FOREIGN table\n"); - row_drop_table_for_mysql((char*)"SYS_FOREIGN", trx, TRUE); + row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); } if (table2) { fprintf(stderr, "InnoDB: dropping incompletely created SYS_FOREIGN_COLS table\n"); - row_drop_table_for_mysql((char*)"SYS_FOREIGN_COLS", trx, TRUE); + row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); } fprintf(stderr, @@ -1010,7 +1050,7 @@ dict_create_or_check_foreign_constraint_tables(void) there are 2 secondary indexes on SYS_FOREIGN, and they are defined just like below */ - str = (char *) + str = "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" "BEGIN\n" "CREATE TABLE\n" @@ -1040,7 +1080,8 @@ dict_create_or_check_foreign_constraint_tables(void) error = trx->error_state; if (error != DB_SUCCESS) { - fprintf(stderr, "InnoDB: error %lu in creation\n", error); + fprintf(stderr, "InnoDB: error %lu in creation\n", + (ulong) 
error); ut_a(error == DB_OUT_OF_FILE_SPACE); @@ -1049,15 +1090,15 @@ dict_create_or_check_foreign_constraint_tables(void) fprintf(stderr, "InnoDB: dropping incompletely created SYS_FOREIGN tables\n"); - row_drop_table_for_mysql((char*)"SYS_FOREIGN", trx, TRUE); - row_drop_table_for_mysql((char*)"SYS_FOREIGN_COLS", trx, TRUE); + row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); + row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); error = DB_MUST_GET_MORE_FILE_SPACE; } que_graph_free(graph); - trx->op_info = (char *) ""; + trx->op_info = ""; row_mysql_unlock_data_dictionary(trx); @@ -1120,7 +1161,7 @@ dict_create_add_foreigns_to_dictionary( ut_ad(mutex_own(&(dict_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ - if (NULL == dict_table_get_low((char *) "SYS_FOREIGN")) { + if (NULL == dict_table_get_low("SYS_FOREIGN")) { fprintf(stderr, "InnoDB: table SYS_FOREIGN not found from internal data dictionary\n"); @@ -1139,7 +1180,7 @@ loop: ulint namelen = strlen(table->name); char* id = mem_heap_alloc(foreign->heap, namelen + 20); /* no overflow if number < 1e13 */ - sprintf(id, "%s_ibfk_%lu", table->name, number++); + sprintf(id, "%s_ibfk_%lu", table->name, (ulong) number++); foreign->id = id; } @@ -1180,7 +1221,7 @@ loop: *sqlend++ = '\''; sqlend = ut_strcpyq(sqlend, '\'', foreign->id); *sqlend++ = '\''; *sqlend++ = ','; - sqlend += sprintf(sqlend, "%010lu", i); + sqlend += sprintf(sqlend, "%010lu", (ulong) i); *sqlend++ = ','; *sqlend++ = '\''; sqlend = ut_strcpyq(sqlend, '\'', foreign->foreign_col_names[i]); @@ -1238,7 +1279,7 @@ loop: if (error != DB_SUCCESS) { fprintf(stderr, "InnoDB: Foreign key constraint creation failed:\n" - "InnoDB: internal error number %lu\n", error); + "InnoDB: internal error number %lu\n", (ulong) error); mutex_enter(&dict_foreign_err_mutex); ut_print_timestamp(ef); diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index e2c2043db74..71cf908db4e 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ 
-70,7 +70,7 @@ dict_col_reposition_in_cache( /*=========================*/ dict_table_t* table, /* in: table */ dict_col_t* col, /* in: column */ - char* new_name); /* in: new table name */ + const char* new_name); /* in: new table name */ /************************************************************************** Removes a column from the data dictionary hash table. */ static @@ -198,13 +198,14 @@ dict_tables_have_same_db( /************************************************************************ Return the end of table name where we have removed dbname and '/'. */ static -char* +const char* dict_remove_db_name( /*================*/ - /* out: table name */ - char* name) /* in: table name in the form dbname '/' tablename */ + /* out: table name */ + const char* name) /* in: table name in the form + dbname '/' tablename */ { - char* s; + const char* s; s = strchr(name, '/'); ut_a(s); if (s) s++; @@ -309,7 +310,7 @@ dict_table_get_index_noninline( /*===========================*/ /* out: index, NULL if does not exist */ dict_table_t* table, /* in: table */ - char* name) /* in: index name */ + const char* name) /* in: index name */ { return(dict_table_get_index(table, name)); } @@ -600,7 +601,7 @@ dict_table_get_on_id( } /************************************************************************ -Looks for column n postion in the clustered index. */ +Looks for column n position in the clustered index. */ ulint dict_table_get_nth_col_pos( @@ -614,6 +615,44 @@ dict_table_get_nth_col_pos( n)); } +/************************************************************************ +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. 
*/ + +ibool +dict_table_col_in_clustered_key( +/*============================*/ + /* out: TRUE if the column, or its prefix, is + in the clustered key */ + dict_table_t* table, /* in: table */ + ulint n) /* in: column number */ +{ + dict_index_t* index; + dict_field_t* field; + dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(table); + + col = dict_table_get_nth_col(table, n); + + index = dict_table_get_first_index(table); + + n_fields = dict_index_get_n_unique(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + + return(TRUE); + } + } + + return(FALSE); +} + /************************************************************************** Inits the data dictionary module. */ @@ -655,9 +694,10 @@ directory dict_table_get_low is usually the appropriate function. */ dict_table_t* dict_table_get( /*===========*/ - /* out: table, NULL if does not exist */ - char* table_name, /* in: table name */ - trx_t* trx) /* in: transaction handle or NULL */ + /* out: table, NULL if + does not exist */ + const char* table_name, /* in: table name */ + trx_t* trx) /* in: transaction handle or NULL */ { dict_table_t* table; @@ -684,9 +724,10 @@ Returns a table object and increments MySQL open handle count on the table. 
*/ dict_table_t* dict_table_get_and_increment_handle_count( /*======================================*/ - /* out: table, NULL if does not exist */ - char* table_name, /* in: table name */ - trx_t* trx) /* in: transaction handle or NULL */ + /* out: table, NULL if + does not exist */ + const char* table_name, /* in: table name */ + trx_t* trx) /* in: transaction handle or NULL */ { dict_table_t* table; @@ -704,7 +745,7 @@ dict_table_get_and_increment_handle_count( mutex_exit(&(dict_sys->mutex)); if (table != NULL) { - if (!table->stat_initialized) { + if (!table->stat_initialized && !table->ibd_file_missing) { dict_update_statistics(table); } } @@ -743,23 +784,33 @@ dict_table_add_to_cache( The clustered index will not always physically contain all system columns. */ - dict_mem_table_add_col(table, (char *) "DB_ROW_ID", DATA_SYS, + dict_mem_table_add_col(table, "DB_ROW_ID", DATA_SYS, DATA_ROW_ID, 0, 0); - ut_ad(DATA_ROW_ID == 0); - dict_mem_table_add_col(table, (char *) "DB_TRX_ID", DATA_SYS, +#if DATA_ROW_ID != 0 +#error "DATA_ROW_ID != 0" +#endif + dict_mem_table_add_col(table, "DB_TRX_ID", DATA_SYS, DATA_TRX_ID, 0, 0); - ut_ad(DATA_TRX_ID == 1); - dict_mem_table_add_col(table, (char *) "DB_ROLL_PTR", DATA_SYS, - DATA_ROLL_PTR, - 0, 0); - ut_ad(DATA_ROLL_PTR == 2); +#if DATA_TRX_ID != 1 +#error "DATA_TRX_ID != 1" +#endif + dict_mem_table_add_col(table, "DB_ROLL_PTR", DATA_SYS, + DATA_ROLL_PTR, 0, 0); +#if DATA_ROLL_PTR != 2 +#error "DATA_ROLL_PTR != 2" +#endif - dict_mem_table_add_col(table, (char *) "DB_MIX_ID", DATA_SYS, + dict_mem_table_add_col(table, "DB_MIX_ID", DATA_SYS, DATA_MIX_ID, 0, 0); - ut_ad(DATA_MIX_ID == 3); - ut_ad(DATA_N_SYS_COLS == 4); /* This assert reminds that if a new - system column is added to the program, - it should be dealt with here */ +#if DATA_MIX_ID != 3 +#error "DATA_MIX_ID != 3" +#endif + + /* This check reminds that if a new system column is added to + the program, it should be dealt with here */ +#if DATA_N_SYS_COLS != 4 +#error 
"DATA_N_SYS_COLS != 4" +#endif /* Look for a table with the same name: error if such exists */ { @@ -848,7 +899,7 @@ dict_table_rename_in_cache( /*=======================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table */ - char* new_name, /* in: new name */ + const char* new_name, /* in: new name */ ibool rename_also_foreigns)/* in: in ALTER TABLE we want to preserve the original table name in constraints which reference it */ @@ -858,6 +909,7 @@ dict_table_rename_in_cache( ulint fold; ulint old_size; char* old_name; + ibool success; ulint i; ut_ad(table); @@ -875,6 +927,21 @@ dict_table_rename_in_cache( HASH_SEARCH(name_hash, dict_sys->table_hash, fold, table2, (ut_strcmp(table2->name, new_name) == 0)); if (table2) { + fprintf(stderr, +"InnoDB: Error: dictionary cache already contains a table of name %s\n", + new_name); + return(FALSE); + } + } + + /* If the table is stored in a single-table tablespace, rename the + .ibd file */ + + if (table->space != 0) { + success = fil_rename_tablespace(table->name, table->space, + new_name); + if (!success) { + return(FALSE); } } @@ -896,7 +963,6 @@ dict_table_rename_in_cache( /* Add table to hash table of tables */ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, table); - dict_sys->size += (mem_heap_get_size(table->heap) - old_size); /* Update the table_name field in indexes */ @@ -1042,6 +1108,33 @@ dict_table_rename_in_cache( } /************************************************************************** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. 
*/ + +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /* in: table object already in cache */ + dulint new_id) /* in: new id to set */ +{ + ut_ad(table); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the table from the hash table of id's */ + + HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); + table->id = new_id; + + /* Add the table back to the hash table */ + HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); +} + +/************************************************************************** Removes a table object from the dictionary cache. */ void @@ -1216,7 +1309,7 @@ dict_col_reposition_in_cache( /*=========================*/ dict_table_t* table, /* in: table */ dict_col_t* col, /* in: column */ - char* new_name) /* in: new table name */ + const char* new_name) /* in: new table name */ { ulint fold; @@ -1893,7 +1986,7 @@ dict_foreign_find( /*==============*/ /* out: foreign constraint */ dict_table_t* table, /* in: table object */ - char* id) /* in: foreign constraint id */ + const char* id) /* in: foreign constraint id */ { dict_foreign_t* foreign; @@ -1941,7 +2034,7 @@ dict_foreign_find_index( column types must match */ { dict_index_t* index; - char* col_name; + const char* col_name; ulint i; index = dict_table_get_first_index(table); @@ -2287,7 +2380,7 @@ dict_scan_id( *id = mem_heap_strdupl(heap, s, len); } else { /* no heap given: id will point to source string */ - *id = (char*) s; + *id = s; } return(ptr); @@ -2486,14 +2579,14 @@ static char* dict_strip_comments( /*================*/ - /* out, own: SQL string stripped from - comments; the caller must free this - with mem_free()! 
*/ - char* sql_string) /* in: SQL string */ + /* out, own: SQL string stripped from + comments; the caller must free this + with mem_free()! */ + const char* sql_string) /* in: SQL string */ { - char* str; - char* sptr; - char* ptr; + char* str; + const char* sptr; + char* ptr; str = mem_alloc(strlen(sql_string) + 1); @@ -2670,7 +2763,7 @@ dict_create_foreign_constraints_low( ut_ad(mutex_own(&(dict_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ - table = dict_table_get_low((char*) name); + table = dict_table_get_low(name); if (table == NULL) { mutex_enter(&dict_foreign_err_mutex); @@ -3167,16 +3260,19 @@ allowed to contain more fields than mentioned in the constraint. */ ulint dict_create_foreign_constraints( /*============================*/ - /* out: error code or DB_SUCCESS */ - trx_t* trx, /* in: transaction */ - char* sql_string, /* in: table create or ALTER TABLE - statement where foreign keys are declared like: - FOREIGN KEY (a, b) REFERENCES table2(c, d), - table2 can be written also with the database - name before it: test.table2; the default - database is the database of parameter name */ - char* name) /* in: table full name in the normalized form - database_name/table_name */ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES + table2(c, d), table2 can be written + also with the database + name before it: test.table2; the + default database is the database of + parameter name */ + const char* name) /* in: table full name in the + normalized form + database_name/table_name */ { char* str; ulint err; @@ -3410,7 +3506,7 @@ dict_tree_free( /*===========*/ dict_tree_t* tree) /* in, own: index tree */ { - ut_ad(tree); + ut_a(tree); ut_ad(tree->magic_n == DICT_TREE_MAGIC_N); rw_lock_free(&(tree->lock)); @@ -3425,7 +3521,8 @@ dict_tree_find_index_low( /*=====================*/ /* out: index */ dict_tree_t* tree, /*
in: index tree */ - rec_t* rec) /* in: record for which to find correct index */ + rec_t* rec) /* in: record for which to find correct + index */ { dict_index_t* index; dict_table_t* table; @@ -3463,7 +3560,8 @@ dict_tree_find_index( /*=================*/ /* out: index */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec) /* in: record for which to find correct index */ + rec_t* rec) /* in: record for which to find correct + index */ { dict_index_t* index; @@ -3553,7 +3651,8 @@ dict_tree_build_node_ptr( /*=====================*/ /* out, own: node pointer */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec, /* in: record for which to build node pointer */ + rec_t* rec, /* in: record for which to build node + pointer */ ulint page_no,/* in: page number to put in node pointer */ mem_heap_t* heap, /* in: memory heap where pointer created */ ulint level) /* in: level of rec in tree: 0 means leaf @@ -3715,6 +3814,16 @@ dict_update_statistics_low( ulint size; ulint sum_of_index_sizes = 0; + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: cannot calculate statistics for table %s\n" +"InnoDB: because the .ibd file is missing. See section 15.1 of\n" +"InnoDB: http:/www.innodb.com/ibman.html for help\n", table->name); + + return; + } + /* If we have set a high innodb_force_recovery level, do not calculate statistics, as a badly corrupted index can cause a crash in it. */ @@ -3839,7 +3948,7 @@ Prints a table data when we know the table name. 
*/ void dict_table_print_by_name( /*=====================*/ - char* name) + const char* name) { dict_table_t* table; @@ -3877,10 +3986,11 @@ dict_table_print_low( fprintf(stderr, ", id %lu %lu, columns %lu, indexes %lu, appr.rows %lu\n" " COLUMNS: ", - ut_dulint_get_high(table->id), - ut_dulint_get_low(table->id), - table->n_cols, UT_LIST_GET_LEN(table->indexes), - (ulint)table->stat_n_rows); + (ulong) ut_dulint_get_high(table->id), + (ulong) ut_dulint_get_low(table->id), + (ulong) table->n_cols, + (ulong) UT_LIST_GET_LEN(table->indexes), + (ulong) table->stat_n_rows); for (i = 0; i < table->n_cols - 1; i++) { dict_col_print_low(dict_table_get_nth_col(table, i)); @@ -3964,14 +4074,14 @@ dict_index_print_low( " root page %lu, appr.key vals %lu," " leaf pages %lu, size pages %lu\n" " FIELDS: ", - ut_dulint_get_high(tree->id), - ut_dulint_get_low(tree->id), - index->n_user_defined_cols, - index->n_fields, index->type, - tree->page, - (ulint)n_vals, - index->stat_n_leaf_pages, - index->stat_index_size); + (ulong) ut_dulint_get_high(tree->id), + (ulong) ut_dulint_get_low(tree->id), + (ulong) index->n_user_defined_cols, + (ulong) index->n_fields, (ulong) index->type, + (ulong) tree->page, + (ulong) n_vals, + (ulong) index->stat_n_leaf_pages, + (ulong) index->stat_index_size); for (i = 0; i < index->n_fields; i++) { dict_field_print_low(dict_index_get_nth_field(index, i)); @@ -3999,7 +4109,7 @@ dict_field_print_low( ut_print_name(stderr, field->name); if (field->prefix_len != 0) { - fprintf(stderr, "(%lu)", field->prefix_len); + fprintf(stderr, "(%lu)", (ulong) field->prefix_len); } } diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index 6a4d4c86824..ee4ae9dd1a1 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -19,6 +19,7 @@ Created 4/24/1996 Heikki Tuuri #include "mach0data.h" #include "dict0dict.h" #include "dict0boot.h" +#include "srv0start.h" /************************************************************************ Finds the 
first table name in the given database. */ @@ -26,9 +27,10 @@ Finds the first table name in the given database. */ char* dict_get_first_table_name_in_db( /*============================*/ - /* out, own: table name, NULL if does not exist; - the caller must free the memory in the string! */ - char* name) /* in: database name which ends to '/' */ + /* out, own: table name, NULL if + does not exist; the caller must + free the memory in the string! */ + const char* name) /* in: database name which ends to '/' */ { dict_table_t* sys_tables; btr_pcur_t pcur; @@ -49,7 +51,7 @@ dict_get_first_table_name_in_db( mtr_start(&mtr); - sys_tables = dict_table_get_low((char *) "SYS_TABLES"); + sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); tuple = dtuple_create(heap, 1); @@ -125,7 +127,7 @@ dict_print(void) mtr_start(&mtr); - sys_tables = dict_table_get_low((char *) "SYS_TABLES"); + sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, @@ -185,6 +187,99 @@ loop: } /************************************************************************ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). +In a normal startup we just scan the biggest space id, and store it to +fil_system. 
*/ + +void +dict_check_tablespaces_or_store_max_id( +/*===================================*/ + ibool in_crash_recovery) /* in: are we doing a crash recovery */ +{ + dict_table_t* sys_tables; + dict_index_t* sys_index; + btr_pcur_t pcur; + rec_t* rec; + byte* field; + ulint len; + ulint space_id; + ulint max_space_id = 0; + mtr_t mtr; + + mutex_enter(&(dict_sys->mutex)); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + + btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, + TRUE, &mtr); +loop: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + /* end of index */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + /* We must make the tablespace cache aware of the biggest + known space id */ + + /* printf("Biggest space id in data dictionary %lu\n", + max_space_id); */ + fil_set_max_space_id_if_bigger(max_space_id); + + mutex_exit(&(dict_sys->mutex)); + + return; + } + + field = rec_get_nth_field(rec, 0, &len); + + if (!rec_get_deleted_flag(rec)) { + + /* We found one */ + + char* name = mem_strdupl(field, len); + + field = rec_get_nth_field(rec, 9, &len); + ut_a(len == 4); + + space_id = mach_read_from_4(field); + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + if (space_id != 0 && in_crash_recovery) { + /* Check that the tablespace (the .ibd file) really + exists; print a warning to the .err log if not */ + + fil_space_for_table_exists_in_mem(space_id, name, + TRUE, TRUE); + } + + mem_free(name); + + if (space_id > max_space_id) { + max_space_id = space_id; + } + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + } + + goto loop; +} + +/************************************************************************ Loads definitions for table columns. 
*/ static void @@ -216,7 +311,7 @@ dict_load_columns( mtr_start(&mtr); - sys_columns = dict_table_get_low((char*) "SYS_COLUMNS"); + sys_columns = dict_table_get_low("SYS_COLUMNS"); sys_index = UT_LIST_GET_FIRST(sys_columns->indexes); tuple = dtuple_create(heap, 1); @@ -246,7 +341,7 @@ dict_load_columns( ut_ad(len == 4); ut_a(i == mach_read_from_4(field)); - ut_a(0 == ut_strcmp((char*) "NAME", + ut_a(0 == ut_strcmp("NAME", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_columns), 4))->name)); @@ -260,10 +355,19 @@ dict_load_columns( field = rec_get_nth_field(rec, 6, &len); prtype = mach_read_from_4(field); + if (dtype_is_non_binary_string_type(mtype, prtype) + && dtype_get_charset_coll(prtype) == 0) { + /* This is a non-binary string type, and the table + was created with < 4.1.2. Use the default charset. */ + + prtype = dtype_form_prtype(prtype, + data_mysql_default_charset_coll); + } + field = rec_get_nth_field(rec, 7, &len); col_len = mach_read_from_4(field); - ut_a(0 == ut_strcmp((char*) "PREC", + ut_a(0 == ut_strcmp("PREC", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_columns), 8))->name)); @@ -285,8 +389,8 @@ Report that an index field or index for a table has been delete marked. 
*/ static void dict_load_report_deleted_index( - char* name, /* in: table name */ - ulint field) /* in: index field, or ULINT_UNDEFINED */ + const char* name, /* in: table name */ + ulint field) /* in: index field, or ULINT_UNDEFINED */ { fputs("InnoDB: Error: data dictionary entry" " for table ", stderr); @@ -333,7 +437,7 @@ dict_load_fields( mtr_start(&mtr); - sys_fields = dict_table_get_low((char*) "SYS_FIELDS"); + sys_fields = dict_table_get_low("SYS_FIELDS"); sys_index = UT_LIST_GET_FIRST(sys_fields->indexes); tuple = dtuple_create(heap, 1); @@ -373,18 +477,18 @@ dict_load_fields( pos_and_prefix_len = mach_read_from_4(field); - ut_a((pos_and_prefix_len & 0xFFFF) == i - || (pos_and_prefix_len & 0xFFFF0000) == (i << 16)); + ut_a((pos_and_prefix_len & 0xFFFFUL) == i + || (pos_and_prefix_len & 0xFFFF0000UL) == (i << 16)); if ((i == 0 && pos_and_prefix_len > 0) - || (pos_and_prefix_len & 0xFFFF0000) > 0) { + || (pos_and_prefix_len & 0xFFFF0000UL) > 0) { - prefix_len = pos_and_prefix_len & 0xFFFF; + prefix_len = pos_and_prefix_len & 0xFFFFUL; } else { prefix_len = 0; } - ut_a(0 == ut_strcmp((char*) "COL_NAME", + ut_a(0 == ut_strcmp("COL_NAME", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_fields), 4))->name)); @@ -446,7 +550,7 @@ dict_load_indexes( mtr_start(&mtr); - sys_indexes = dict_table_get_low((char*) "SYS_INDEXES"); + sys_indexes = dict_table_get_low("SYS_INDEXES"); sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes); tuple = dtuple_create(heap, 1); @@ -489,7 +593,7 @@ dict_load_indexes( ut_ad(len == 8); id = mach_read_from_8(field); - ut_a(0 == ut_strcmp((char*)"NAME", + ut_a(0 == ut_strcmp("NAME", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_indexes), 4))->name)); @@ -506,7 +610,7 @@ dict_load_indexes( field = rec_get_nth_field(rec, 7, &len); space = mach_read_from_4(field); - ut_a(0 == ut_strcmp((char*) "PAGE_NO", + ut_a(0 == ut_strcmp("PAGE_NO", dict_field_get_col( dict_index_get_nth_field( 
dict_table_get_first_index(sys_indexes), 8))->name)); @@ -549,11 +653,11 @@ dict_load_indexes( && ((type & DICT_CLUSTERED) || ((table == dict_sys->sys_tables) && (name_len == (sizeof "ID_IND") - 1) - && (0 == ut_memcmp(name_buf, (char*)"ID_IND", + && (0 == ut_memcmp(name_buf, "ID_IND", name_len))))) { - /* The index was created in memory already in - booting */ + /* The index was created in memory already at booting + of the database server */ } else { index = dict_mem_index_create(table->name, name_buf, space, type, n_fields); @@ -584,9 +688,15 @@ dictionary cache. */ dict_table_t* dict_load_table( /*============*/ - /* out: table, NULL if does not exist */ - char* name) /* in: table name */ + /* out: table, NULL if does not exist; + if the table is stored in an .ibd file, + but the file does not exist, + then we set the ibd_file_missing flag TRUE + in the table object we return */ + const char* name) /* in: table name in the + databasename/tablename format */ { + ibool ibd_file_missing = FALSE; dict_table_t* table; dict_table_t* sys_tables; btr_pcur_t pcur; @@ -610,7 +720,7 @@ dict_load_table( mtr_start(&mtr); - sys_tables = dict_table_get_low((char *) "SYS_TABLES"); + sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); tuple = dtuple_create(heap, 1); @@ -646,7 +756,7 @@ dict_load_table( return(NULL); } - ut_a(0 == ut_strcmp((char *) "SPACE", + ut_a(0 == ut_strcmp("SPACE", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_tables), 9))->name)); @@ -654,7 +764,24 @@ dict_load_table( field = rec_get_nth_field(rec, 9, &len); space = mach_read_from_4(field); - ut_a(0 == ut_strcmp((char *) "N_COLS", + /* Check if the tablespace exists and has the right name */ + if (space != 0) { + if (fil_space_for_table_exists_in_mem(space, name, FALSE, + FALSE)) { + /* Ok; (if we did a crash recovery then the tablespace + can already be in the memory cache) */ + } else { + /* Try to open the tablespace */ + if 
(!fil_open_single_table_tablespace(space, name)) { + /* We failed to find a sensible tablespace + file */ + + ibd_file_missing = TRUE; + } + } + } + + ut_a(0 == ut_strcmp("N_COLS", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_tables), 4))->name)); @@ -664,7 +791,9 @@ dict_load_table( table = dict_mem_table_create(name, space, n_cols); - ut_a(0 == ut_strcmp((char *) "ID", + table->ibd_file_missing = ibd_file_missing; + + ut_a(0 == ut_strcmp("ID", dict_field_get_col( dict_index_get_nth_field( dict_table_get_first_index(sys_tables), 3))->name)); @@ -853,7 +982,7 @@ static void dict_load_foreign_cols( /*===================*/ - char* id, /* in: foreign constraint id as a null- + const char* id, /* in: foreign constraint id as a null- terminated string */ dict_foreign_t* foreign)/* in: foreign constraint object */ { @@ -879,7 +1008,7 @@ dict_load_foreign_cols( foreign->n_fields * sizeof(void*)); mtr_start(&mtr); - sys_foreign_cols = dict_table_get_low((char *) "SYS_FOREIGN_COLS"); + sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes); tuple = dtuple_create(foreign->heap, 1); @@ -926,9 +1055,9 @@ static ulint dict_load_foreign( /*==============*/ - /* out: DB_SUCCESS or error code */ - char* id) /* in: foreign constraint id as a null-terminated - string */ + /* out: DB_SUCCESS or error code */ + const char* id) /* in: foreign constraint id as a + null-terminated string */ { dict_foreign_t* foreign; dict_table_t* sys_foreign; @@ -951,7 +1080,7 @@ dict_load_foreign( mtr_start(&mtr); - sys_foreign = dict_table_get_low((char *) "SYS_FOREIGN"); + sys_foreign = dict_table_get_low("SYS_FOREIGN"); sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes); tuple = dtuple_create(heap2, 1); @@ -1011,7 +1140,7 @@ dict_load_foreign( /* We store the type to the bits 24-31 of n_fields */ foreign->type = foreign->n_fields >> 24; - foreign->n_fields = foreign->n_fields & 0xFFFFFF; + foreign->n_fields 
= foreign->n_fields & 0xFFFFFFUL; foreign->id = mem_heap_strdup(foreign->heap, id); @@ -1057,8 +1186,8 @@ already in the dictionary cache. */ ulint dict_load_foreigns( /*===============*/ - /* out: DB_SUCCESS or error code */ - char* table_name) /* in: table name */ + /* out: DB_SUCCESS or error code */ + const char* table_name) /* in: table name */ { btr_pcur_t pcur; mem_heap_t* heap; @@ -1077,7 +1206,7 @@ dict_load_foreigns( ut_ad(mutex_own(&(dict_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ - sys_foreign = dict_table_get_low((char *) "SYS_FOREIGN"); + sys_foreign = dict_table_get_low("SYS_FOREIGN"); if (sys_foreign == NULL) { /* No foreign keys defined yet in this database */ diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c index 85bd79a72f5..8f05475df47 100644 --- a/innobase/dict/dict0mem.c +++ b/innobase/dict/dict0mem.c @@ -30,15 +30,14 @@ dict_table_t* dict_mem_table_create( /*==================*/ /* out, own: table object */ - char* name, /* in: table name */ - ulint space, /* in: space where the clustered index of + const char* name, /* in: table name */ + ulint space, /* in: space where the clustered index of the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols) /* in: number of columns */ + ulint n_cols) /* in: number of columns */ { dict_table_t* table; - char* str; mem_heap_t* heap; ut_ad(name); @@ -48,12 +47,12 @@ dict_mem_table_create( table = mem_heap_alloc(heap, sizeof(dict_table_t)); table->heap = heap; - - str = mem_heap_strdup(heap, name); table->type = DICT_TABLE_ORDINARY; - table->name = str; + table->name = mem_heap_strdup(heap, name); table->space = space; + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; table->n_def = 0; table->n_cols = n_cols + DATA_N_SYS_COLS; table->mem_fix = 0; @@ -101,11 +100,11 @@ dict_table_t* dict_mem_cluster_create( /*====================*/ /* out, own: cluster object */ - char* name, /* in: cluster name */ - ulint space, 
/* in: space where the clustered indexes + const char* name, /* in: cluster name */ + ulint space, /* in: space where the clustered indexes of the member tables are placed */ - ulint n_cols, /* in: number of columns */ - ulint mix_len) /* in: length of the common key prefix in the + ulint n_cols, /* in: number of columns */ + ulint mix_len)/* in: length of the common key prefix in the cluster */ { dict_table_t* cluster; @@ -125,7 +124,7 @@ void dict_mem_table_make_cluster_member( /*===============================*/ dict_table_t* table, /* in: non-published table */ - char* cluster_name) /* in: cluster name */ + const char* cluster_name) /* in: cluster name */ { table->type = DICT_TABLE_CLUSTER_MEMBER; table->cluster_name = cluster_name; @@ -138,7 +137,7 @@ void dict_mem_table_add_col( /*===================*/ dict_table_t* table, /* in: table */ - char* name, /* in: column name */ + const char* name, /* in: column name */ ulint mtype, /* in: main datatype */ ulint prtype, /* in: precise type */ ulint len, /* in: length */ @@ -172,14 +171,15 @@ Creates an index memory object. */ dict_index_t* dict_mem_index_create( /*==================*/ - /* out, own: index object */ - char* table_name, /* in: table name */ - char* index_name, /* in: index name */ - ulint space, /* in: space where the index tree is placed, - ignored if the index is of the clustered - type */ - ulint type, /* in: DICT_UNIQUE, DICT_CLUSTERED, ... ORed */ - ulint n_fields) /* in: number of fields */ + /* out, own: index object */ + const char* table_name, /* in: table name */ + const char* index_name, /* in: index name */ + ulint space, /* in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /* in: DICT_UNIQUE, + DICT_CLUSTERED, ... 
ORed */ + ulint n_fields) /* in: number of fields */ { dict_index_t* index; mem_heap_t* heap; @@ -255,7 +255,7 @@ void dict_mem_index_add_field( /*=====================*/ dict_index_t* index, /* in: index */ - char* name, /* in: column name */ + const char* name, /* in: column name */ ulint order, /* in: order criterion; 0 means an ascending order */ ulint prefix_len) /* in: 0 or the column prefix length diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 1750294ac94..59fbd6f785d 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -1,5 +1,5 @@ /****************************************************** -The low-level file system +The tablespace memory cache (c) 1995 Innobase Oy @@ -16,16 +16,22 @@ Created 10/25/1995 Heikki Tuuri #include "mach0data.h" #include "ibuf0ibuf.h" #include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" #include "log0log.h" #include "log0recv.h" #include "fsp0fsp.h" #include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" + /* - IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM - =========================================== + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= -The file system is responsible for providing fast read/write access to +The tablespace cache is responsible for providing fast read/write access to tablespaces and logs of the database. File creation and deletion is done in other modules which know more of the logic of the operation, however. 
@@ -88,20 +94,31 @@ ulint fil_n_pending_tablespace_flushes = 0; /* Null file address */ fil_addr_t fil_addr_null = {FIL_NULL, 0}; -/* File system file node data structure */ +/* File node of a tablespace or the log data space */ typedef struct fil_node_struct fil_node_t; struct fil_node_struct { - char* name; /* the file name or path */ + fil_space_t* space; /* backpointer to the space where this node + belongs */ + char* name; /* path to the file */ ibool open; /* TRUE if file open */ os_file_t handle; /* OS handle to the file, if file open */ - ulint size; /* size of the file in database pages - (where the possible last incomplete megabyte - is ignored) */ + ibool is_raw_disk;/* TRUE if the 'file' is actually a raw + device or a raw disk partition */ + ulint size; /* size of the file in database pages, 0 if + not known yet; the possible last incomplete + megabyte is ignored if space == 0 */ ulint n_pending; - /* count of pending i/o-ops on this file */ - ibool is_modified; /* this is set to TRUE when we write - to the file and FALSE when we call fil_flush - for this file space */ + /* count of pending i/o's on this file; + closing of the file is not allowed if + this is > 0 */ + ulint n_pending_flushes; + /* count of pending flushes on this file; + closing of the file is not allowed if + this is > 0 */ + ib_longlong modification_counter;/* when we write to the file we + increment this by one */ + ib_longlong flush_counter;/* up to what modification_counter value + we have flushed the modifications to disk */ UT_LIST_NODE_T(fil_node_t) chain; /* link field for the file chain */ UT_LIST_NODE_T(fil_node_t) LRU; @@ -111,19 +128,52 @@ struct fil_node_struct { #define FIL_NODE_MAGIC_N 89389 -/* File system tablespace or log data structure: let us call them by a common -name space */ +/* Tablespace or log data space: let us call them by a common name space */ struct fil_space_struct { - char* name; /* space name */ + char* name; /* space name = the path to the first 
file in + it */ ulint id; /* space id */ + ib_longlong tablespace_version; + /* in DISCARD/IMPORT this timestamp is used to + check if we should ignore an insert buffer + merge request for a page because it actually + was for the previous incarnation of the + space */ + ibool mark; /* this is set to TRUE at database startup if + the space corresponds to a table in the InnoDB + data dictionary; so we can print a warning of + orphaned tablespaces */ + ibool stop_ios;/* TRUE if we want to rename the .ibd file of + tablespace and want to stop temporarily + posting of new i/o requests on the file */ + ibool stop_ibuf_merges; + /* we set this TRUE when we start deleting a + single-table tablespace */ + ibool is_being_deleted; + /* this is set to TRUE when we start + deleting a single-table tablespace and its + file; when this flag is set no further i/o + or flush requests can be placed on this space, + though there may be such requests still being + processed on this space */ ulint purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */ UT_LIST_BASE_NODE_T(fil_node_t) chain; /* base node for the file chain */ - ulint size; /* space size in pages */ + ulint size; /* space size in pages; 0 if a single-table + tablespace whose size we do not know yet */ ulint n_reserved_extents; /* number of reserved free extents for ongoing operations like B-tree page split */ + ulint n_pending_flushes; /* this is > 0 when flushing + the tablespace to disk; dropping of the + tablespace is forbidden if this is > 0 */ + ulint n_pending_ibuf_merges;/* this is > 0 when merging + insert buffer entries to a page so that we + may need to access the ibuf bitmap page in the + tablespade: dropping of the tablespace is + forbidden if this is > 0 */ hash_node_t hash; /* hash chain node */ + hash_node_t name_hash;/* hash chain the name_hash table */ rw_lock_t latch; /* latch protecting the file space storage allocation */ UT_LIST_NODE_T(fil_space_t) space_list; @@ -135,80 +185,126 @@ struct 
fil_space_struct { #define FIL_SPACE_MAGIC_N 89472 -/* The file system data structure */ +/* The tablespace memory cache; also the totality of logs = the log data space, +is stored here; below we talk about tablespaces, but also the ib_logfiles +form a 'space' and it is handled here */ typedef struct fil_system_struct fil_system_t; struct fil_system_struct { - mutex_t mutex; /* The mutex protecting the system */ + mutex_t mutex; /* The mutex protecting the cache */ hash_table_t* spaces; /* The hash table of spaces in the - system */ + system; they are hashed on the space + id */ + hash_table_t* name_hash; /* hash table based on the space + name */ UT_LIST_BASE_NODE_T(fil_node_t) LRU; /* base node for the LRU list of the - most recently used open files */ - ulint n_open_pending; /* current number of open files with - pending i/o-ops on them */ - ulint max_n_open; /* maximum allowed open files */ - os_event_t can_open; /* this event is set to the signaled - state when the system is capable of - opening a new file, i.e., - n_open_pending < max_n_open */ + most recently used open files with no + pending i/o's; if we start an i/o on + the file, we first remove it from this + list, and return it to the start of + the list when the i/o ends; + log files and the system tablespace are + not put to this list: they are opened + after the startup, and kept open until + shutdown */ + ulint n_open; /* number of files currently open */ + ulint max_n_open; /* n_open is not allowed to exceed + this */ + ib_longlong modification_counter;/* when we write to a file we + increment this by one */ + ulint max_assigned_id;/* maximum space id in the existing + tables, or assigned during the time + mysqld has been up; at an InnoDB + startup we scan the data dictionary + and set here the maximum of the + space id's of the tables there */ + ib_longlong tablespace_version; + /* a counter which is incremented for + every space object memory creation; + every space mem object gets a + 
'timestamp' from this; in DISCARD/ + IMPORT this is used to check if we + should ignore an insert buffer merge + request */ UT_LIST_BASE_NODE_T(fil_space_t) space_list; /* list of all file spaces */ }; -/* The file system. This variable is NULL before the module is initialized. */ +/* The tablespace memory cache. This variable is NULL before the module is +initialized. */ fil_system_t* fil_system = NULL; -/* The file system hash table size */ -#define FIL_SYSTEM_HASH_SIZE 500 +/* The tablespace memory cache hash table size */ +#define FIL_SYSTEM_HASH_SIZE 50 /* TODO: make bigger! */ -/*********************************************************************** -Reserves a right to open a single file. The right must be released with -fil_release_right_to_open. */ +/************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ +static void -fil_reserve_right_to_open(void) -/*===========================*/ -{ -loop: - mutex_enter(&(fil_system->mutex)); - - if (fil_system->n_open_pending == fil_system->max_n_open) { - - /* It is not sure we can open the file if it is closed: wait */ - - os_event_reset(fil_system->can_open); - - mutex_exit(&(fil_system->mutex)); +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space); /* in: space */ +/************************************************************************ +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. 
*/ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type); /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +/*********************************************************************** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. */ +static +ulint +fil_get_space_id_for_table( +/*=======================*/ + /* out: space id, ULINT_UNDEFINED if not + found */ + const char* name); /* in: table name in the standard + 'databasename/tablename' format */ - os_event_wait(fil_system->can_open); - goto loop; - } +/*********************************************************************** +Returns the version number of a tablespace, -1 if not found. */ - fil_system->max_n_open--; +ib_longlong +fil_space_get_version( +/*==================*/ + /* out: version number, -1 if the tablespace does not + exist in the memory cache */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ib_longlong version = -1; - mutex_exit(&(fil_system->mutex)); -} + ut_ad(system); -/*********************************************************************** -Releases a right to open a single file. 
*/ + mutex_enter(&(system->mutex)); -void -fil_release_right_to_open(void) -/*===========================*/ -{ - mutex_enter(&(fil_system->mutex)); - - if (fil_system->n_open_pending == fil_system->max_n_open) { + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - os_event_set(fil_system->can_open); + if (space) { + version = space->tablespace_version; } - fil_system->max_n_open++; + mutex_exit(&(system->mutex)); - mutex_exit(&(fil_system->mutex)); + return(version); } /*********************************************************************** @@ -220,8 +316,8 @@ fil_space_get_latch( /* out: latch protecting storage allocation */ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -229,6 +325,8 @@ fil_space_get_latch( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + mutex_exit(&(system->mutex)); return(&(space->latch)); @@ -243,8 +341,8 @@ fil_space_get_type( /* out: FIL_TABLESPACE or FIL_LOG */ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -252,6 +350,8 @@ fil_space_get_type( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + mutex_exit(&(system->mutex)); return(space->purpose); @@ -266,17 +366,21 @@ fil_space_get_ibuf_data( /* out: ibuf data for this space */ ulint id) /* in: space id */ { + fil_system_t* system = fil_system; fil_space_t* space; - fil_system_t* system = fil_system; ut_ad(system); + ut_a(id == 0); + mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); mutex_exit(&(system->mutex)); + ut_a(space); + return(space->ibuf_data); } @@ -286,18 +390,19 @@ Appends a new file to the chain of files of a space. File must be closed. 
*/ void fil_node_create( /*============*/ - char* name, /* in: file name (file must be closed) */ - ulint size, /* in: file size in database blocks, rounded downwards - to an integer */ - ulint id) /* in: space id where to append */ + const char* name, /* in: file name (file must be closed) */ + ulint size, /* in: file size in database blocks, rounded + downwards to an integer */ + ulint id, /* in: space id where to append */ + ibool is_raw) /* in: TRUE if a raw device or + a raw disk partition */ { + fil_system_t* system = fil_system; fil_node_t* node; fil_space_t* space; - fil_system_t* system = fil_system; ut_a(system); ut_a(name); - ut_a(size > 0); mutex_enter(&(system->mutex)); @@ -305,29 +410,123 @@ fil_node_create( node->name = mem_strdup(name); node->open = FALSE; + + ut_a(!is_raw || srv_start_raw_disk_in_use); + + node->is_raw_disk = is_raw; node->size = size; node->magic_n = FIL_NODE_MAGIC_N; node->n_pending = 0; + node->n_pending_flushes = 0; - node->is_modified = FALSE; + node->modification_counter = 0; + node->flush_counter = 0; HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: Could not find tablespace %lu for\n" +"InnoDB: file %s from the tablespace memory cache.\n", (ulong) id, name); + mem_free(node->name); + + mem_free(node); + + mutex_exit(&(system->mutex)); + + return; + } + space->size += size; + node->space = space; + UT_LIST_ADD_LAST(chain, space->chain, node); mutex_exit(&(system->mutex)); } +/************************************************************************ +Opens a the file of a node of a tablespace. The caller must own the fil_system +mutex. 
*/ +static +void +fil_node_open_file( +/*===============*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space) /* in: space */ +{ + ib_longlong size_bytes; + ulint size_low; + ulint size_high; + ibool ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(system->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_a(node->n_pending == 0); + ut_a(node->open == FALSE); + + /* printf("Opening file %s\n", node->name); */ + + if (space->purpose == FIL_LOG) { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, &ret); + } else if (node->is_raw_disk) { + node->handle = os_file_create(node->name, + OS_FILE_OPEN_RAW, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } else { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } + + ut_a(ret); + + node->open = TRUE; + + system->n_open++; + + if (node->size == 0) { + os_file_get_size(node->handle, &size_low, &size_high); + + size_bytes = (((ib_longlong)size_high) << 32) + + (ib_longlong)size_low; +#ifdef UNIV_HOTBACKUP + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + +#else + /* It must be a single-table tablespace and we do not know the + size of the file yet */ + + ut_a(space->id != 0); + + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { + node->size = (ulint) ((size_bytes / (1024 * 1024)) + * ((1024 * 1024) / UNIV_PAGE_SIZE)); + } else { + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + } +#endif + space->size += node->size; + } + + if (space->purpose == FIL_TABLESPACE && space->id != 0) { + /* Put the node to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } +} + /************************************************************************** Closes a file. 
*/ static void -fil_node_close( -/*===========*/ +fil_node_close_file( +/*================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system) /* in: file system */ + fil_system_t* system) /* in: tablespace memory cache */ { ibool ret; @@ -337,24 +536,211 @@ fil_node_close( #endif /* UNIV_SYNC_DEBUG */ ut_a(node->open); ut_a(node->n_pending == 0); + ut_a(node->n_pending_flushes == 0); + ut_a(node->modification_counter == node->flush_counter); ret = os_file_close(node->handle); ut_a(ret); + /* printf("Closing file %s\n", node->name); */ + node->open = FALSE; + ut_a(system->n_open > 0); + system->n_open--; + + if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) { + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(LRU, system->LRU, node); + /* The node is in the LRU list, remove it */ + UT_LIST_REMOVE(LRU, system->LRU, node); + } +} + +/************************************************************************ +Tries to close a file in the LRU list. The caller must hold the fil_sys +mutex. 
*/ +static +ibool +fil_try_to_close_file_in_LRU( +/*=========================*/ + /* out: TRUE if success, FALSE if should retry + later; since i/o's generally complete in < + 100 ms, and as InnoDB writes at most 128 pages + from the buffer pool in a batch, and then + immediately flushes the files, there is a good + chance that the next time we find a suitable + node from the LRU list */ + ibool print_info) /* in: if TRUE, prints information why it + cannot close a file */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(system->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + node = UT_LIST_GET_LAST(system->LRU); + + if (print_info) { + fprintf(stderr, +"InnoDB: fil_sys open file LRU len %lu\n", (ulong) UT_LIST_GET_LEN(system->LRU)); + } + + while (node != NULL) { + if (node->modification_counter == node->flush_counter + && node->n_pending_flushes == 0) { + + fil_node_close_file(node, system); + + return(TRUE); + } + + if (print_info && node->n_pending_flushes > 0) { + fprintf(stderr, +"InnoDB: cannot close file %s, because n_pending_flushes %lu\n", node->name, + (ulong) node->n_pending_flushes); + } + + if (print_info + && node->modification_counter != node->flush_counter) { + fprintf(stderr, +"InnoDB: cannot close file %s, because mod_count %lld != fl_count %lld\n", + node->name, node->modification_counter, + node->flush_counter); + } + + node = UT_LIST_GET_PREV(LRU, node); + } + + return(FALSE); } /*********************************************************************** -Frees a file node object from a file system. */ +Reserves the fil_system mutex and tries to make sure we can open at least one +file while holding it. This should be called before calling +fil_node_prepare_for_io(), because that function may need to open a file. 
*/ +static +void +fil_mutex_enter_and_prepare_for_io( +/*===============================*/ + ulint space_id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ibool success; + ibool print_info = FALSE; + ulint count = 0; + ulint count2 = 0; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!mutex_own(&(system->mutex))); +#endif /* UNIV_SYNC_DEBUG */ +retry: + mutex_enter(&(system->mutex)); + + if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) { + /* We keep log files and system tablespace files always open; + this is important in preventing deadlocks in this module, as + a page read completion often performs another read from the + insert buffer. The insert buffer is in tablespace 0, and we + cannot end up waiting in this function. */ + + return; + } + + if (system->n_open < system->max_n_open) { + + return; + } + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (space != NULL && space->stop_ios) { + /* We are going to do a rename file and want to stop new i/o's + for a while */ + + if (count2 > 20000) { + fprintf(stderr, +"InnoDB: Warning: tablespace %s has i/o ops stopped for a long time %lu\n", + space->name, + (ulong) count2); + } + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + count2++; + + goto retry; + } + + /* If the file is already open, no need to do anything; if the space + does not exist, we handle the situation in the function which called + this function */ + + if (!space || UT_LIST_GET_FIRST(space->chain)->open) { + + return; + } + + if (count > 1) { + print_info = TRUE; + } + + /* Too many files are open, try to close some */ +close_more: + success = fil_try_to_close_file_in_LRU(print_info); + + if (success && system->n_open >= system->max_n_open) { + + goto close_more; + } + + if (system->n_open < system->max_n_open) { + /* Ok */ + + return; + } + + if (count >= 2) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: too many (%lu) files stay open while 
the maximum\n" +"InnoDB: allowed value would be %lu.\n" +"InnoDB: You may need to raise the value of innodb_max_files_open in\n" +"InnoDB: my.cnf.\n", (ulong) system->n_open, (ulong) system->max_n_open); + + return; + } + + mutex_exit(&(system->mutex)); + +#ifndef UNIV_HOTBACKUP + /* Wake the i/o-handler threads to make sure pending i/o's are + performed */ + os_aio_simulated_wake_handler_threads(); + + os_thread_sleep(20000); +#endif + /* Flush tablespaces so that we can close modified files in the LRU + list */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + count++; + + goto retry; +} + +/*********************************************************************** +Frees a file node object from a tablespace memory cache. */ static void fil_node_free( /*==========*/ fil_node_t* node, /* in, own: file node */ - fil_system_t* system, /* in: file system */ + fil_system_t* system, /* in: tablespace memory cache */ fil_space_t* space) /* in: space where the file node is chained */ { ut_ad(node && system && space); @@ -362,9 +748,15 @@ fil_node_free( ut_ad(mutex_own(&(system->mutex))); #endif /* UNIV_SYNC_DEBUG */ ut_a(node->magic_n == FIL_NODE_MAGIC_N); + ut_a(node->n_pending == 0); if (node->open) { - fil_node_close(node, system); + /* We fool the assertion in fil_node_close_file() to think + there are no unflushed modifications in the file */ + + node->modification_counter = node->flush_counter; + + fil_node_close_file(node, system); } space->size -= node->size; @@ -387,9 +779,9 @@ fil_space_truncate_start( if this does not equal to the combined size of some initial files in the space */ { + fil_system_t* system = fil_system; fil_node_t* node; fil_space_t* space; - fil_system_t* system = fil_system; mutex_enter(&(system->mutex)); @@ -398,7 +790,6 @@ fil_space_truncate_start( ut_a(space); while (trunc_len > 0) { - node = UT_LIST_GET_FIRST(space->chain); ut_a(node->size * UNIV_PAGE_SIZE >= trunc_len); @@ -409,17 +800,341 @@ fil_space_truncate_start( } 
mutex_exit(&(system->mutex)); -} +} + +/*********************************************************************** +Creates a space memory object and puts it to the tablespace memory cache. If +there is an error, prints an error message to the .err log. */ + +ibool +fil_space_create( +/*=============*/ + /* out: TRUE if success */ + const char* name, /* in: space name */ + ulint id, /* in: space id */ + ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ulint namesake_id; +try_again: + /*printf( + "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name, + purpose);*/ + + ut_a(system); + ut_a(name); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(name), space, + 0 == strcmp(name, space->name)); + if (space != NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to init to the tablespace memory cache\n" +"InnoDB: a tablespace %lu of name %s,\n" +"InnoDB: but a tablespace %lu of the same name %s\n" +"InnoDB: already exists in the tablespace memory cache!\n", + (ulong) id, name, + (ulong) space->id, space->name); + + if (id == 0 || purpose != FIL_TABLESPACE) { + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + fprintf(stderr, +"InnoDB: We assume that InnoDB did a crash recovery, and you had\n" +"InnoDB: an .ibd file for which the table did not exist in the\n" +"InnoDB: InnoDB internal data dictionary in the ibdata files.\n" +"InnoDB: We assume that you later removed the .ibd and .frm files,\n" +"InnoDB: and are now trying to recreate the table. 
We now remove the\n" +"InnoDB: conflicting tablespace object from the memory cache and try\n" +"InnoDB: the init again.\n"); + + namesake_id = space->id; + + mutex_exit(&(system->mutex)); + + fil_space_free(namesake_id); + + goto try_again; + } + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space != NULL) { + fprintf(stderr, +"InnoDB: Error: trying to add tablespace %lu of name %s\n" +"InnoDB: to the tablespace memory cache, but tablespace\n" +"InnoDB: %lu of name %s already exists in the tablespace\n" +"InnoDB: memory cache!\n", (ulong) id, name, (ulong) space->id, space->name); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + space = mem_alloc(sizeof(fil_space_t)); + + space->name = mem_strdup(name); + space->id = id; + + system->tablespace_version++; + space->tablespace_version = system->tablespace_version; + space->mark = FALSE; + + if (purpose == FIL_TABLESPACE && id > system->max_assigned_id) { + system->max_assigned_id = id; + } + + space->stop_ios = FALSE; + space->stop_ibuf_merges = FALSE; + space->is_being_deleted = FALSE; + space->purpose = purpose; + space->size = 0; + + space->n_reserved_extents = 0; + + space->n_pending_flushes = 0; + space->n_pending_ibuf_merges = 0; + + UT_LIST_INIT(space->chain); + space->magic_n = FIL_SPACE_MAGIC_N; + + space->ibuf_data = NULL; + + rw_lock_create(&(space->latch)); + rw_lock_set_level(&(space->latch), SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(name), space); + UT_LIST_ADD_LAST(space_list, system->space_list, space); + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. 
*/ +static +ulint +fil_assign_new_space_id(void) +/*=========================*/ + /* out: new tablespace id; ULINT_UNDEFINED if could + not assign an id */ +{ + fil_system_t* system = fil_system; + ulint id; + + mutex_enter(&(system->mutex)); + + system->max_assigned_id++; + + id = system->max_assigned_id; + + if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Warning: you are running out of new single-table tablespace id's.\n" +"InnoDB: Current counter is %lu and it must not exceed %lu!\n" +"InnoDB: To reset the counter to zero you have to dump all your tables and\n" +"InnoDB: recreate the whole InnoDB installation.\n", (ulong) id, + (ulong) SRV_LOG_SPACE_FIRST_ID); + } + + if (id >= SRV_LOG_SPACE_FIRST_ID) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: You have run out of single-table tablespace id's!\n" +"InnoDB: Current counter is %lu.\n" +"InnoDB: To reset the counter to zero you have to dump all your tables and\n" +"InnoDB: recreate the whole InnoDB installation.\n", (ulong) id); + system->max_assigned_id--; + + id = ULINT_UNDEFINED; + } + + mutex_exit(&(system->mutex)); + + return(id); +} + +/*********************************************************************** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. 
*/ + +ibool +fil_space_free( +/*===========*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_space_t* namespace; + fil_node_t* fil_node; + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: trying to remove tablespace %lu from the cache but\n" +"InnoDB: it is not there.\n", (ulong) id); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(space->name), + namespace, 0 == strcmp(space->name, namespace->name)); + ut_a(namespace); + ut_a(space == namespace); + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + + UT_LIST_REMOVE(space_list, system->space_list, space); + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(0 == space->n_pending_flushes); + + fil_node = UT_LIST_GET_FIRST(space->chain); + + while (fil_node != NULL) { + fil_node_free(fil_node, system, space); + + fil_node = UT_LIST_GET_FIRST(space->chain); + } + + ut_a(0 == UT_LIST_GET_LEN(space->chain)); + + mutex_exit(&(system->mutex)); + + rw_lock_free(&(space->latch)); + + mem_free(space->name); + mem_free(space); + + return(TRUE); +} + +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Returns the tablespace object for a given id, or NULL if not found from the +tablespace memory cache. */ +static +fil_space_t* +fil_get_space_for_id_low( +/*=====================*/ + /* out: tablespace object or NULL; NOTE that you must + own &(fil_system->mutex) to call this function! 
*/ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + ut_ad(system); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + return(space); +} +#endif + +/*********************************************************************** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. */ + +ulint +fil_space_get_size( +/*===============*/ + /* out: space size, 0 if space not found */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + ulint size; + + ut_ad(system); + + fil_mutex_enter_and_prepare_for_io(id); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + mutex_exit(&(system->mutex)); + + return(0); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + fil_node_prepare_for_io(node, system, space); + fil_node_complete_io(node, system, OS_FILE_READ); + } + + size = space->size; + + mutex_exit(&(system->mutex)); + + return(size); +} + +/*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. */ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no)/* in: page number */ +{ + if (fil_space_get_size(id) > page_no) { + + return(TRUE); + } + + return(FALSE); +} /******************************************************************** -Creates a file system object. */ +Creates a the tablespace memory cache. 
*/ static fil_system_t* fil_system_create( /*==============*/ - /* out, own: file system object */ + /* out, own: tablespace memory cache */ ulint hash_size, /* in: hash table size */ - ulint max_n_open) /* in: maximum number of open files */ + ulint max_n_open) /* in: maximum number of open files; must be + > 10 */ { fil_system_t* system; @@ -433,12 +1148,17 @@ fil_system_create( mutex_set_level(&(system->mutex), SYNC_ANY_LATCH); system->spaces = hash_create(hash_size); + system->name_hash = hash_create(hash_size); UT_LIST_INIT(system->LRU); - system->n_open_pending = 0; + system->n_open = 0; system->max_n_open = max_n_open; - system->can_open = os_event_create(NULL); + + system->modification_counter = 0; + system->max_assigned_id = 0; + + system->tablespace_version = 0; UT_LIST_INIT(system->space_list); @@ -446,7 +1166,7 @@ fil_system_create( } /******************************************************************** -Initializes the file system of this module. */ +Initializes the tablespace memory cache. */ void fil_init( @@ -455,11 +1175,120 @@ fil_init( { ut_a(fil_system == NULL); + /*printf("Initializing the tablespace cache with max %lu open files\n", + max_n_open); */ fil_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open); } +/*********************************************************************** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. 
*/ + +void +fil_open_log_and_system_tablespace_files(void) +/*==========================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + if (space->purpose != FIL_TABLESPACE || space->id == 0) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (!node->open) { + fil_node_open_file(node, system, + space); + } + if (system->max_n_open < 10 + system->n_open) { + fprintf(stderr, +"InnoDB: Warning: you must raise the value of innodb_max_open_files in\n" +"InnoDB: my.cnf! Remember that InnoDB keeps all log files and all system\n" +"InnoDB: tablespace files open for the whole time mysqld is running, and\n" +"InnoDB: needs to open also some .ibd files if the file-per-table storage\n" +"InnoDB: model is used. Current open files %lu, max allowed open files %lu.\n", + (ulong) system->n_open, + (ulong) system->max_n_open); + } + node = UT_LIST_GET_NEXT(chain, node); + } + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. 
*/ + +void +fil_close_all_files(void) +/*=====================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (node->open) { + fil_node_close_file(node, system); + } + node = UT_LIST_GET_NEXT(chain, node); + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ + +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id) /* in: maximum known id */ +{ + fil_system_t* system = fil_system; + + if (max_id >= SRV_LOG_SPACE_FIRST_ID) { + fprintf(stderr, +"InnoDB: Fatal error: max tablespace id is too high, %lu\n", (ulong) max_id); + ut_a(0); + } + + mutex_enter(&(system->mutex)); + + if (system->max_assigned_id < max_id) { + + system->max_assigned_id = max_id; + } + + mutex_exit(&(system->mutex)); +} + /******************************************************************** -Writes the flushed lsn to the header of each file space. */ +Initializes the ibuf data structure for space 0 == the system tablespace. +This can be called after the file space headers have been created and the +dictionary system has been initialized. 
*/ void fil_ibuf_init_at_db_start(void) @@ -468,28 +1297,26 @@ fil_ibuf_init_at_db_start(void) fil_space_t* space; space = UT_LIST_GET_FIRST(fil_system->space_list); - - while (space) { - if (space->purpose == FIL_TABLESPACE) { - space->ibuf_data = ibuf_data_init_for_space(space->id); - } - space = UT_LIST_GET_NEXT(space_list, space); - } + ut_a(space); + ut_a(space->purpose == FIL_TABLESPACE); + + space->ibuf_data = ibuf_data_init_for_space(space->id); } /******************************************************************** -Writes the flushed lsn and the latest archived log number to the page -header of the first page of a data file. */ +Writes the flushed lsn and the latest archived log number to the page header +of the first page of a data file. */ static ulint fil_write_lsn_and_arch_no_to_file( /*==============================*/ ulint space_id, /* in: space number */ - ulint sum_of_sizes, /* in: combined size of previous files in space, - in database pages */ + ulint sum_of_sizes, /* in: combined size of previous files in + space, in database pages */ dulint lsn, /* in: lsn to write */ - ulint arch_log_no) /* in: archived log number to write */ + ulint arch_log_no /* in: archived log number to write */ + __attribute__((unused))) { byte* buf1; byte* buf; @@ -500,7 +1327,6 @@ fil_write_lsn_and_arch_no_to_file( fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); - mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no); fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); @@ -509,7 +1335,7 @@ fil_write_lsn_and_arch_no_to_file( /******************************************************************** Writes the flushed lsn and the latest archived log number to the page -header of the first page of each data file. */ +header of the first page of each data file in the system tablespace. 
*/ ulint fil_write_flushed_lsn_to_data_files( @@ -528,18 +1354,22 @@ fil_write_flushed_lsn_to_data_files( space = UT_LIST_GET_FIRST(fil_system->space_list); while (space) { + /* We only write the lsn to all existing data files which have + been open during the lifetime of the mysqld process; they are + represented by the space objects in the tablespace memory + cache. Note that all data files in the system tablespace 0 are + always open. */ + if (space->purpose == FIL_TABLESPACE) { sum_of_sizes = 0; node = UT_LIST_GET_FIRST(space->chain); - while (node) { mutex_exit(&(fil_system->mutex)); err = fil_write_lsn_and_arch_no_to_file( - space->id, - sum_of_sizes, - lsn, arch_log_no); + space->id, sum_of_sizes, + lsn, arch_log_no); if (err != DB_SUCCESS) { return(err); @@ -548,11 +1378,9 @@ fil_write_flushed_lsn_to_data_files( mutex_enter(&(fil_system->mutex)); sum_of_sizes += node->size; - node = UT_LIST_GET_NEXT(chain, node); } } - space = UT_LIST_GET_NEXT(space_list, space); } @@ -571,15 +1399,16 @@ fil_read_flushed_lsn_and_arch_log_no( os_file_t data_file, /* in: open data file */ ibool one_read_already, /* in: TRUE if min and max parameters below already contain sensible data */ - dulint* min_flushed_lsn, /* in/out: */ +#ifdef UNIV_LOG_ARCHIVE ulint* min_arch_log_no, /* in/out: */ - dulint* max_flushed_lsn, /* in/out: */ - ulint* max_arch_log_no) /* in/out: */ + ulint* max_arch_log_no, /* in/out: */ +#endif /* UNIV_LOG_ARCHIVE */ + dulint* min_flushed_lsn, /* in/out: */ + dulint* max_flushed_lsn) /* in/out: */ { byte* buf; byte* buf2; dulint flushed_lsn; - ulint arch_log_no; buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); /* Align the memory for a possible read from a raw device */ @@ -588,16 +1417,16 @@ fil_read_flushed_lsn_and_arch_log_no( os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE); flushed_lsn = mach_read_from_8(buf + FIL_PAGE_FILE_FLUSH_LSN); - arch_log_no = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO); ut_free(buf2); if (!one_read_already) { *min_flushed_lsn = 
flushed_lsn; *max_flushed_lsn = flushed_lsn; +#ifdef UNIV_LOG_ARCHIVE *min_arch_log_no = arch_log_no; *max_arch_log_no = arch_log_no; - +#endif /* UNIV_LOG_ARCHIVE */ return; } @@ -607,116 +1436,1516 @@ fil_read_flushed_lsn_and_arch_log_no( if (ut_dulint_cmp(*max_flushed_lsn, flushed_lsn) < 0) { *max_flushed_lsn = flushed_lsn; } +#ifdef UNIV_LOG_ARCHIVE if (*min_arch_log_no > arch_log_no) { *min_arch_log_no = arch_log_no; } if (*max_arch_log_no < arch_log_no) { *max_arch_log_no = arch_log_no; } +#endif /* UNIV_LOG_ARCHIVE */ } +/*================ SINGLE-TABLE TABLESPACES ==========================*/ + /*********************************************************************** -Creates a space object and puts it to the file system. */ +Increments the count of pending insert buffer page merges, if space is not +being deleted. */ + +ibool +fil_inc_pending_ibuf_merges( +/*========================*/ + /* out: TRUE if being deleted, and ibuf merges should + be skipped */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: trying to do ibuf merge to a dropped tablespace %lu\n", + (ulong) id); + } + + if (space == NULL || space->stop_ibuf_merges) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + space->n_pending_ibuf_merges++; + + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Decrements the count of pending insert buffer page merges. 
*/ void -fil_space_create( +fil_decr_pending_ibuf_merges( +/*========================*/ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: decrementing ibuf merge of a dropped tablespace %lu\n", + (ulong) id); + } + + if (space != NULL) { + space->n_pending_ibuf_merges--; + } + + mutex_exit(&(system->mutex)); +} + +static +void +fil_create_directory_for_tablename( +/*===============================*/ + const char* name) /* in: name in the standard + 'databasename/tablename' format */ +{ + const char* namend; + char* path; + ulint len; + + len = strlen(fil_path_to_mysql_datadir); + namend = strchr(name, '/'); + ut_a(namend); + path = mem_alloc(len + (namend - name) + 2); + + memcpy(path, fil_path_to_mysql_datadir, len); + path[len] = '/'; + memcpy(path + len + 1, name, namend - name); + path[len + (namend - name) + 1] = 0; + + srv_normalize_path_for_win(path); + + ut_a(os_file_create_directory(path, FALSE)); + mem_free(path); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************ +Writes a log record about an .ibd file create/rename/delete. 
*/ +static +void +fil_op_write_log( /*=============*/ - char* name, /* in: space name */ - ulint id, /* in: space id */ - ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */ + ulint type, /* in: MLOG_FILE_CREATE, + MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id, /* in: space id */ + const char* name, /* in: table name in the familiar + 'databasename/tablename' format, or + the file path in the case of + MLOG_FILE_DELETE */ + const char* new_name, /* in: if type is MLOG_FILE_RENAME, + the new table name in the + 'databasename/tablename' format */ + mtr_t* mtr) /* in: mini-transaction handle */ { - fil_space_t* space; - fil_system_t* system = fil_system; + byte* log_ptr; + + log_ptr = mlog_open(mtr, 30); - ut_a(system); - ut_a(name); + log_ptr = mlog_write_initial_log_record_for_file_op(type, space_id, 0, + log_ptr, mtr); + /* Let us store the strings as null-terminated for easier readability + and handling */ + + mach_write_to_2(log_ptr, ut_strlen(name) + 1); + log_ptr += 2; + + mlog_close(mtr, log_ptr); -#ifndef UNIV_BASIC_LOG_DEBUG - /* Spaces with an odd id number are reserved to replicate spaces - used in log debugging */ + mlog_catenate_string(mtr, (byte*) name, ut_strlen(name) + 1); + + if (type == MLOG_FILE_RENAME) { + log_ptr = mlog_open(mtr, 30); + mach_write_to_2(log_ptr, ut_strlen(new_name) + 1); + log_ptr += 2; - ut_a((purpose == FIL_LOG) || (id % 2 == 0)); + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) new_name, + ut_strlen(new_name) + 1); + } +} #endif - mutex_enter(&(system->mutex)); - space = mem_alloc(sizeof(fil_space_t)); +/*********************************************************************** +Parses the body of a log record written about an .ibd file operation. That is, +the log record part after the standard (type, space id, page no) header of the +log record. + +If desired, also replays the delete or rename operation if the .ibd file +exists and the space id in it matches. 
Replays the create operation if a file +at that path does not exist yet. If the database directory for the file to be +created does not exist, then we create the directory, too. + +Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the +datadir that we should use in replaying the file operations. */ + +byte* +fil_op_log_parse_or_replay( +/*=======================*/ + /* out: end of log record, or NULL if the + record was not completely contained between + ptr and end_ptr */ + byte* ptr, /* in: buffer containing the log record body, + or an initial segment of it, if the record does + not fir completely between ptr and end_ptr */ + byte* end_ptr, /* in: buffer end */ + ulint type, /* in: the type of this log record */ + ibool do_replay, /* in: TRUE if we want to replay the + operation, and not just parse the log record */ + ulint space_id) /* in: if do_replay is TRUE, the space id of + the tablespace in question; otherwise + ignored */ +{ + ulint name_len; + ulint new_name_len; + const char* name; + const char* new_name = NULL; - space->name = mem_strdup(name); - space->id = id; - space->purpose = purpose; - space->size = 0; + if (end_ptr < ptr + 2) { - space->n_reserved_extents = 0; + return(NULL); + } + + name_len = mach_read_from_2(ptr); + + ptr += 2; + + if (end_ptr < ptr + name_len) { + + return(NULL); + } + + name = (const char*) ptr; + + ptr += name_len; + + if (type == MLOG_FILE_RENAME) { + if (end_ptr < ptr + 2) { + + return(NULL); + } + + new_name_len = mach_read_from_2(ptr); + + ptr += 2; + + if (end_ptr < ptr + new_name_len) { + + return(NULL); + } + + new_name = (const char*) ptr; + + ptr += new_name_len; + } + + /* We managed to parse a full log record body */ +/* + printf("Parsed log rec of type %lu space %lu\n" + "name %s\n", type, space_id, name); + + if (type == MLOG_FILE_RENAME) { + printf("new name %s\n", new_name); + } +*/ + if (do_replay == FALSE) { + + return(ptr); + } + + /* Let us try to perform the file operation, if 
sensible. Note that + ibbackup has at this stage already read in all space id info to the + fil0fil.c data structures. - UT_LIST_INIT(space->chain); - space->magic_n = FIL_SPACE_MAGIC_N; + NOTE that our algorithm is not guaranteed to work correctly if there + were renames of tables during the backup. See ibbackup code for more + on the problem. */ - space->ibuf_data = NULL; + if (type == MLOG_FILE_DELETE) { + if (fil_tablespace_exists_in_mem(space_id)) { + ut_a(fil_delete_tablespace(space_id)); + } + } else if (type == MLOG_FILE_RENAME) { + /* We do the rename based on space id, not old file name; + this should guarantee that after the log replay each .ibd file + has the correct name for the latest log sequence number; the + proof is left as an exercise :) */ + + if (fil_tablespace_exists_in_mem(space_id)) { + /* Create the database directory for the new name, if + it does not exist yet */ + fil_create_directory_for_tablename(new_name); - rw_lock_create(&(space->latch)); - rw_lock_set_level(&(space->latch), SYNC_FSP); + /* Rename the table if there is not yet a tablespace + with the same name */ + + if (fil_get_space_id_for_table(new_name) + == ULINT_UNDEFINED) { + /* We do not care of the old name, that is + why we pass NULL as the first argument */ + ut_a(fil_rename_tablespace(NULL, space_id, + new_name)); + } + } + } else { + ut_a(type == MLOG_FILE_CREATE); + + if (fil_tablespace_exists_in_mem(space_id)) { + /* Do nothing */ + } else if (fil_get_space_id_for_table(name) != + ULINT_UNDEFINED) { + /* Do nothing */ + } else { + /* Create the database directory for name, if it does + not exist yet */ + fil_create_directory_for_tablename(name); + + ut_a(space_id != 0); + + ut_a(DB_SUCCESS == + fil_create_new_single_table_tablespace( + &space_id, name, + FIL_IBD_FILE_INITIAL_SIZE)); + } + } + + return(ptr); +} + +/*********************************************************************** +Deletes a single-table tablespace. 
The tablespace must be cached in the +memory cache. */ + +ibool +fil_delete_tablespace( +/*==================*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char* path; + + ut_a(id != 0); +stop_ibuf_merges: + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space != NULL) { + space->stop_ibuf_merges = TRUE; + + if (space->n_pending_ibuf_merges == 0) { + mutex_exit(&(system->mutex)); + + count = 0; + + goto try_again; + } else { + if (count > 5000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to delete tablespace %s,\n" +"InnoDB: but there are %lu pending ibuf merges on it.\n" +"InnoDB: Loop %lu.\n", space->name, (ulong) space->n_pending_ibuf_merges, + (ulong) count); + } + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + count++; + + goto stop_ibuf_merges; + } + } + + mutex_exit(&(system->mutex)); + count = 0; + +try_again: + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: cannot delete tablespace %lu\n" +"InnoDB: because it is not found in the tablespace memory cache.\n", + (ulong) id); + + mutex_exit(&(system->mutex)); - HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + return(FALSE); + } + + ut_a(space); + ut_a(space->n_pending_ibuf_merges == 0); + + space->is_being_deleted = TRUE; + + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (space->n_pending_flushes > 0 || node->n_pending > 0) { + if (count > 1000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to delete tablespace %s,\n" +"InnoDB: but there are %lu flushes and %lu pending i/o's on it\n" +"InnoDB: Loop %lu.\n", space->name, (ulong) 
space->n_pending_flushes, + (ulong) node->n_pending, + (ulong) count); + } + mutex_exit(&(system->mutex)); + os_thread_sleep(20000); + + count++; + + goto try_again; + } + + path = mem_strdup(space->name); - UT_LIST_ADD_LAST(space_list, system->space_list, space); - mutex_exit(&(system->mutex)); +#ifndef UNIV_HOTBACKUP + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since we have set space->is_being_deleted = TRUE, readahead + or ibuf merge can no longer read more pages of this tablespace to the + buffer pool. Thus we can clean the tablespace out of the buffer pool + completely and permanently. The flag is_being_deleted also prevents + fil_flush() from being applied to this tablespace. */ + + buf_LRU_invalidate_tablespace(id); +#endif + /* printf("Deleting tablespace %s id %lu\n", space->name, id); */ + + success = fil_space_free(id); + + if (success) { + success = os_file_delete(path); + } + + if (success) { +#ifndef UNIV_HOTBACKUP + /* Write a log record about the deletion of the .ibd + file, so that ibbackup can replay it in the + --apply-log phase. We use a dummy mtr and the familiar + log write mechanism. */ + mtr_t mtr; + + /* When replaying the operation in ibbackup, do not try + to write any log record */ + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_DELETE, id, path, NULL, &mtr); + mtr_commit(&mtr); +#endif + mem_free(path); + + return(TRUE); + } + + mem_free(path); + + return(FALSE); } /*********************************************************************** -Frees a space object from a file system. Closes the files in the chain -but does not delete them. */ +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. 
Discarding is like deleting a tablespace, but +1) we do not drop the table from the data dictionary; +2) we remove all insert buffer entries for the tablespace immediately; in DROP +TABLE they are only removed gradually in the background; +3) when the user does IMPORT TABLESPACE, the tablespace will have the same id +as it originally had. */ -void -fil_space_free( -/*===========*/ +ibool +fil_discard_tablespace( +/*===================*/ + /* out: TRUE if success */ ulint id) /* in: space id */ { + ibool success; + + success = fil_delete_tablespace(id); + + if (!success) { + fprintf(stderr, +"InnoDB: Warning: cannot delete tablespace %lu in DISCARD TABLESPACE.\n" +"InnoDB: But let us remove the insert buffer entries for this tablespace.\n", + (ulong) id); + } + + /* Remove all insert buffer entries for the tablespace */ + + ibuf_delete_for_discarded_space(id); + + return(TRUE); +} + +/*********************************************************************** +Renames the memory cache structures of a single-table tablespace. 
*/ +static +ibool +fil_rename_tablespace_in_mem( +/*=========================*/ + /* out: TRUE if success */ + fil_space_t* space, /* in: tablespace memory object */ + fil_node_t* node, /* in: file node of that tablespace */ + const char* path) /* in: new name */ +{ + fil_system_t* system = fil_system; + fil_space_t* space2; + const char* old_name = space->name; + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(old_name), + space2, 0 == strcmp(old_name, space2->name)); + if (space != space2) { + fprintf(stderr, +"InnoDB: Error: cannot find %s in tablespace memory cache\n", old_name); + + return(FALSE); + } + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(path), + space2, 0 == strcmp(path, space2->name)); + if (space2 != NULL) { + fprintf(stderr, +"InnoDB: Error: %s is already in tablespace memory cache\n", path); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + mem_free(space->name); + mem_free(node->name); + + space->name = mem_strdup(path); + node->name = mem_strdup(path); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(path), space); + return(TRUE); +} + +/*********************************************************************** +Allocates a file name for a single-table tablespace. +The string must be freed by caller with mem_free(). 
*/ +static +char* +fil_make_ibd_name( +/*==============*/ + /* out, own: file name */ + const char* name) /* in: table name */ +{ + ulint namelen = strlen(name); + ulint dirlen = strlen(fil_path_to_mysql_datadir); + char* filename = mem_alloc(namelen + dirlen + sizeof "/.ibd"); + + memcpy(filename, fil_path_to_mysql_datadir, dirlen); + filename[dirlen] = '/'; + memcpy(filename + dirlen + 1, name, namelen); + memcpy(filename + dirlen + namelen + 1, ".ibd", sizeof ".ibd"); + + srv_normalize_path_for_win(filename); + return(filename); +} + +/*********************************************************************** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. */ + +ibool +fil_rename_tablespace( +/*==================*/ + /* out: TRUE if success */ + const char* old_name, /* in: old table name in the standard + databasename/tablename format of + InnoDB, or NULL if we do the rename + based on the space id only */ + ulint id, /* in: space id */ + const char* new_name) /* in: new table name in the standard + databasename/tablename format + of InnoDB */ +{ + fil_system_t* system = fil_system; + ibool success; fil_space_t* space; - fil_node_t* fil_node; - fil_system_t* system = fil_system; + fil_node_t* node; + ulint count = 0; + char* path; + ibool old_name_was_specified = TRUE; + char* old_path; + + ut_a(id != 0); + if (old_name == NULL) { + old_name = "(name not specified)"; + old_name_was_specified = FALSE; + } +retry: + count++; + + if (count > 1000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: problems renaming %s to %s, %lu iterations\n", + old_name, new_name, + (ulong) count); + } + mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: cannot find space id %lu from the tablespace memory cache\n" +"InnoDB: though the table %s in a rename 
operation should have that id\n", + (ulong) id, old_name); + mutex_exit(&(system->mutex)); - UT_LIST_REMOVE(space_list, system->space_list, space); + return(FALSE); + } - ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + if (count > 25000) { + space->stop_ios = FALSE; + mutex_exit(&(system->mutex)); - fil_node = UT_LIST_GET_FIRST(space->chain); + return(FALSE); + } - ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain)); + /* We temporarily close the .ibd file because we do not trust that + operating systems can rename an open file. For the closing we have to + wait until there are no pending i/o's or flushes on the file. */ - while (fil_node != NULL) { - fil_node_free(fil_node, system, space); + space->stop_ios = TRUE; - fil_node = UT_LIST_GET_FIRST(space->chain); + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (node->n_pending > 0 || node->n_pending_flushes > 0) { + /* There are pending i/o's or flushes, sleep for a while and + retry */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + goto retry; + + } else if (node->modification_counter > node->flush_counter) { + /* Flush the space */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + fil_flush(id); + + goto retry; + + } else if (node->open) { + /* Close the file */ + + fil_node_close_file(node, system); + } + + /* Check that the old name in the space is right */ + + if (old_name_was_specified) { + old_path = fil_make_ibd_name(old_name); + + ut_a(strcmp(space->name, old_path) == 0); + ut_a(strcmp(node->name, old_path) == 0); + } else { + old_path = mem_strdup(space->name); + } + + /* Rename the tablespace and the node in the memory cache */ + path = fil_make_ibd_name(new_name); + success = fil_rename_tablespace_in_mem(space, node, path); + + if (success) { + success = os_file_rename(old_path, path); + + if (!success) { + /* We have to revert the changes we made + to the tablespace memory cache */ + + 
ut_a(fil_rename_tablespace_in_mem(space, node, + old_path)); + } + } + + mem_free(path); + mem_free(old_path); + + space->stop_ios = FALSE; + + mutex_exit(&(system->mutex)); + +#ifndef UNIV_HOTBACKUP + if (success) { + mtr_t mtr; + + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_RENAME, id, old_name, new_name, + &mtr); + mtr_commit(&mtr); + } +#endif + return(success); +} + +/*********************************************************************** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. */ + +ulint +fil_create_new_single_table_tablespace( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + ulint* space_id, /* in/out: space id; if this is != 0, + then this is an input parameter, + otherwise output */ + const char* tablename, /* in: the table name in the usual + databasename/tablename format + of InnoDB */ + ulint size) /* in: the initial size of the + tablespace file in pages, + must be >= FIL_IBD_FILE_INITIAL_SIZE */ +{ + os_file_t file; + ibool ret; + ulint err; + byte* buf2; + byte* page; + ibool success; + char* path; + + ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); + + path = fil_make_ibd_name(tablename); + + file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + if (ret == FALSE) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error creating file %s.\n", path); + + /* The following call will print an error message */ + + err = os_file_get_last_error(TRUE); + + if (err == OS_FILE_ALREADY_EXISTS) { + fprintf(stderr, +"InnoDB: The file already exists though the corresponding table did not\n" +"InnoDB: exist in the InnoDB data dictionary. 
Have you moved InnoDB\n" +"InnoDB: .ibd files around without using the SQL commands\n" +"InnoDB: DISCARD TABLESPACE and IMPORT TABLESPACE, or did\n" +"InnoDB: mysqld crash in the middle of CREATE TABLE? You can\n" +"InnoDB: resolve the problem by removing the file %s\n" +"InnoDB: under the 'datadir' of MySQL.\n", path); + + mem_free(path); + return(DB_TABLESPACE_ALREADY_EXISTS); + } + + if (err == OS_FILE_DISK_FULL) { + + mem_free(path); + return(DB_OUT_OF_FILE_SPACE); + } + + mem_free(path); + return(DB_ERROR); + } + + buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0); + + if (!ret) { + ut_free(buf2); + os_file_close(file); + os_file_delete(path); + + mem_free(path); + return(DB_OUT_OF_FILE_SPACE); + } + + if (*space_id == 0) { + *space_id = fil_assign_new_space_id(); + } + + /* printf("Creating tablespace %s id %lu\n", path, *space_id); */ + + if (*space_id == ULINT_UNDEFINED) { + ut_free(buf2); + error_exit: + os_file_close(file); + error_exit2: + os_file_delete(path); + + mem_free(path); + return(DB_ERROR); + } + + /* We have to write the space id to the file immediately and flush the + file to disk. This is because in crash recovery we must be aware what + tablespaces exist and what are their space id's, so that we can apply + the log records to the right file. It may take quite a while until + buffer pool flush algorithms write anything to the file and flush it to + disk. If we would not write here anything, the file would be filled + with zeros from the call of os_file_set_size(), until a buffer pool + flush would write to it. 
*/ + + memset(page, '\0', UNIV_PAGE_SIZE); + + fsp_header_write_space_id(page, *space_id); + + buf_flush_init_for_writing(page, ut_dulint_zero, *space_id, 0); + + ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE); + + ut_free(buf2); + + if (!ret) { + fprintf(stderr, +"InnoDB: Error: could not write the first page to tablespace %s\n", path); + goto error_exit; + } + + ret = os_file_flush(file); + + if (!ret) { + fprintf(stderr, +"InnoDB: Error: file flush of tablespace %s failed\n", path); + goto error_exit; + } + + os_file_close(file); + + if (*space_id == ULINT_UNDEFINED) { + goto error_exit2; + } + + success = fil_space_create(path, *space_id, FIL_TABLESPACE); + + if (!success) { + goto error_exit2; } + + fil_node_create(path, size, *space_id, FALSE); + +#ifndef UNIV_HOTBACKUP + { + mtr_t mtr; + + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_CREATE, *space_id, tablename, NULL, &mtr); + + mtr_commit(&mtr); + } +#endif + mem_free(path); + return(DB_SUCCESS); +} + +/************************************************************************ +It is possible, though very improbable, that the lsn's in the tablespace to be +imported have risen above the current system lsn, if a lengthy purge, ibuf +merge, or rollback was performed on a backup taken with ibbackup. If that is +the case, reset page lsn's in the file. We assume that mysqld was shut down +after it performed these cleanup operations on the .ibd file, so that it at +the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the +first page of the .ibd file, and we can determine whether we need to reset the +lsn's just by looking at that flush lsn. 
*/ + +ibool +fil_reset_too_high_lsns( +/*====================*/ + /* out: TRUE if success */ + const char* name, /* in: table name in the + databasename/tablename format */ + dulint current_lsn) /* in: reset lsn's if the lsn stamped + to FIL_PAGE_FILE_FLUSH_LSN in the + first page is too high */ +{ + os_file_t file; + char* filepath; + byte* page; + byte* buf2; + dulint flush_lsn; + ulint space_id; + ib_longlong file_size; + ib_longlong offset; + ulint page_no; + ibool success; + + filepath = fil_make_ibd_name(name); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); + if (!success) { + mem_free(filepath); + + return(FALSE); + } + + /* Read the first page of the tablespace */ + + buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + + /* We have to read the file flush lsn from the header of the file */ + + flush_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); + + if (ut_dulint_cmp(current_lsn, flush_lsn) >= 0) { + /* Ok */ + success = TRUE; + + goto func_exit; + } + + space_id = fsp_header_get_space_id(page); - ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain)); - ut_ad(0 == UT_LIST_GET_LEN(space->chain)); + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Flush lsn in the tablespace file %lu to be imported\n" +"InnoDB: is %lu %lu, which exceeds current system lsn %lu %lu.\n" +"InnoDB: We reset the lsn's in the file %s.\n", + (ulong) space_id, + (ulong) ut_dulint_get_high(flush_lsn), + (ulong) ut_dulint_get_low(flush_lsn), + (ulong) ut_dulint_get_high(current_lsn), + (ulong) ut_dulint_get_low(current_lsn), filepath); + + /* Loop through all the pages in the tablespace and reset the lsn and + the page checksum if necessary */ + + file_size = os_file_get_size_as_iblonglong(file); + + for (offset 
= 0; offset < file_size; offset += UNIV_PAGE_SIZE) { + success = os_file_read(file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), UNIV_PAGE_SIZE); + if (!success) { - mutex_exit(&(system->mutex)); + goto func_exit; + } + if (ut_dulint_cmp(mach_read_from_8(page + FIL_PAGE_LSN), + current_lsn) > 0) { + /* We have to reset the lsn */ + space_id = mach_read_from_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + + buf_flush_init_for_writing(page, current_lsn, space_id, + page_no); + success = os_file_write(filepath, file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + } + } - mem_free(space->name); - mem_free(space); + success = os_file_flush(file); + if (!success) { + + goto func_exit; + } + + /* We now update the flush_lsn stamp at the start of the file */ + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + + mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); + + success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + success = os_file_flush(file); +func_exit: + os_file_close(file); + ut_free(buf2); + mem_free(filepath); + + return(success); } +/************************************************************************ +Tries to open a single-table tablespace and checks the space id is right in +it. If does not succeed, prints an error message to the .err log. This +function is used to open the tablespace when we load a table definition +to the dictionary cache. NOTE that we assume this operation is used under the +protection of the dictionary mutex, so that two users cannot race here. This +operation does not leave the file associated with the tablespace open, but +closes it after we have looked at the space id in it. 
*/ + +ibool +fil_open_single_table_tablespace( +/*=============================*/ + /* out: TRUE if success */ + ulint id, /* in: space id */ + const char* name) /* in: table name in the + databasename/tablename format */ +{ + os_file_t file; + char* filepath; + ibool success; + byte* buf2; + byte* page; + ulint space_id; + ibool ret = TRUE; + + filepath = fil_make_ibd_name(name); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: trying to open a table, but could not\n" +"InnoDB: open the tablespace file %s!\n", filepath); + fprintf(stderr, +"InnoDB: have you moved InnoDB .ibd files around without using the\n" +"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n" +"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n" +"InnoDB: how to resolve the issue.\n"); + + mem_free(filepath); + + return(FALSE); + } + + /* Read the first page of the tablespace */ + + buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + + if (space_id != id) { + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: tablespace id in file %s is %lu, but in the InnoDB\n" +"InnoDB: data dictionary it is %lu.\n", filepath, (ulong) space_id, (ulong) id); + fprintf(stderr, +"InnoDB: Have you moved InnoDB .ibd files around without using the\n" +"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n" +"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n" +"InnoDB: how to resolve the issue.\n"); + + ret = FALSE; + + goto func_exit; + } + + success 
= fil_space_create(filepath, space_id, FIL_TABLESPACE); + + if (!success) { + goto func_exit; + } + + /* We do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(buf2); + mem_free(filepath); + + return(ret); +} + +#ifdef UNIV_HOTBACKUP /*********************************************************************** -Returns the size of the space in pages. */ +Allocates a file name for an old version of a single-table tablespace. +The string must be freed by caller with mem_free()! */ +static +char* +fil_make_ibbackup_old_name( +/*=======================*/ + /* out, own: file name */ + const char* name) /* in: original file name */ +{ + static const char suffix[] = "_ibbackup_old_vers_"; + ulint len = strlen(name); + char* path = mem_alloc(len + (15 + sizeof suffix)); + + memcpy(path, name, len); + memcpy(path + len, suffix, (sizeof suffix) - 1); + ut_sprintf_timestamp_without_extra_chars(path + len + sizeof suffix); + return(path); +} +#endif /* UNIV_HOTBACKUP */ + +/************************************************************************ +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.c data structures. 
*/ +static +void +fil_load_single_table_tablespace( +/*=============================*/ + const char* dbname, /* in: database name */ + const char* filename) /* in: file name (not a path), + including the .ibd extension */ +{ + os_file_t file; + char* filepath; + ibool success; + byte* buf2; + byte* page; + ulint space_id; + ulint size_low; + ulint size_high; + ib_longlong size; +#ifdef UNIV_HOTBACKUP + fil_space_t* space; +#endif + filepath = mem_alloc(strlen(dbname) + strlen(filename) + + strlen(fil_path_to_mysql_datadir) + 3); + + sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname, + filename); + srv_normalize_path_for_win(filepath); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, +"InnoDB: Error: could not open single-table tablespace file\n" +"InnoDB: %s!\n" +"InnoDB: We do not continue crash recovery, because the table will become\n" +"InnoDB: corrupt if we cannot apply the log records in the InnoDB log to it.\n" +"InnoDB: To fix the problem and start mysqld:\n" +"InnoDB: 1) If there is a permission problem in the file and mysqld cannot\n" +"InnoDB: open the file, you should modify the permissions.\n" +"InnoDB: 2) If the table is not needed, or you can restore it from a backup,\n" +"InnoDB: then you can remove the .ibd file, and InnoDB will do a normal\n" +"InnoDB: crash recovery and ignore that table.\n" +"InnoDB: 3) If the file system or the disk is broken, and you cannot remove\n" +"InnoDB: the .ibd file, you can set innodb_force_recovery > 0 in my.cnf\n" +"InnoDB: and force InnoDB to continue crash recovery here.\n", filepath); + + mem_free(filepath); + + if (srv_force_recovery > 0) { + fprintf(stderr, +"InnoDB: innodb_force_recovery was set to %lu. 
Continuing crash recovery\n" +"InnoDB: even though we cannot access the .ibd file of this table.\n", + srv_force_recovery); + return; + } + + exit(1); + } + + success = os_file_get_size(file, &size_low, &size_high); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, +"InnoDB: Error: could not measure the size of single-table tablespace file\n" +"InnoDB: %s!\n" +"InnoDB: We do not continue crash recovery, because the table will become\n" +"InnoDB: corrupt if we cannot apply the log records in the InnoDB log to it.\n" +"InnoDB: To fix the problem and start mysqld:\n" +"InnoDB: 1) If there is a permission problem in the file and mysqld cannot\n" +"InnoDB: access the file, you should modify the permissions.\n" +"InnoDB: 2) If the table is not needed, or you can restore it from a backup,\n" +"InnoDB: then you can remove the .ibd file, and InnoDB will do a normal\n" +"InnoDB: crash recovery and ignore that table.\n" +"InnoDB: 3) If the file system or the disk is broken, and you cannot remove\n" +"InnoDB: the .ibd file, you can set innodb_force_recovery > 0 in my.cnf\n" +"InnoDB: and force InnoDB to continue crash recovery here.\n", filepath); + + os_file_close(file); + mem_free(filepath); + + if (srv_force_recovery > 0) { + fprintf(stderr, +"InnoDB: innodb_force_recovery was set to %lu. Continuing crash recovery\n" +"InnoDB: even though we cannot access the .ibd file of this table.\n", + srv_force_recovery); + return; + } + + exit(1); + } + + /* TODO: What to do in other cases where we cannot access an .ibd + file during a crash recovery? */ + + /* Every .ibd file is created >= 4 pages in size. Smaller files + cannot be ok. 
*/ + + size = (((ib_longlong)size_high) << 32) + (ib_longlong)size_low; +#ifndef UNIV_HOTBACKUP + if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { + fprintf(stderr, +"InnoDB: Error: the size of single-table tablespace file %s\n" +"InnoDB: is only %lu %lu, should be at least %lu!", filepath, + (ulong) size_high, + (ulong) size_low, (ulong) (4 * UNIV_PAGE_SIZE)); + os_file_close(file); + mem_free(filepath); + + return; + } +#endif + /* Read the first page of the tablespace if the size big enough */ + + buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + } else { + space_id = ULINT_UNDEFINED; + } + +#ifndef UNIV_HOTBACKUP + if (space_id == ULINT_UNDEFINED || space_id == 0) { + fprintf(stderr, +"InnoDB: Error: tablespace id %lu in file %s is not sensible\n", + (ulong) space_id, + filepath); + goto func_exit; + } +#else + if (space_id == ULINT_UNDEFINED || space_id == 0) { + char* new_path; + + fprintf(stderr, +"InnoDB: Renaming tablespace %s of id %lu,\n" +"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n" +"InnoDB: because its size %lld is too small (< 4 pages 16 kB each),\n" +"InnoDB: or the space id in the file header is not sensible.\n" +"InnoDB: This can happen in an ibbackup run, and is not dangerous.\n", + filepath, space_id, filepath, size); + os_file_close(file); + + new_path = fil_make_ibbackup_old_name(filepath); + ut_a(os_file_rename(filepath, new_path)); + + ut_free(buf2); + mem_free(filepath); + mem_free(new_path); + + return; + } + + /* A backup may contain the same space several times, if the space got + renamed at a sensitive time. 
Since it is enough to have one version of + the space, we rename the file if a space with the same space id + already exists in the tablespace memory cache. We rather rename the + file than delete it, because if there is a bug, we do not want to + destroy valuable data. */ + + mutex_enter(&(fil_system->mutex)); + + space = fil_get_space_for_id_low(space_id); + + if (space) { + char* new_path; + + fprintf(stderr, +"InnoDB: Renaming tablespace %s of id %lu,\n" +"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n" +"InnoDB: because space %s with the same id\n" +"InnoDB: was scanned earlier. This can happen if you have renamed tables\n" +"InnoDB: during an ibbackup run.\n", filepath, space_id, filepath, + space->name); + os_file_close(file); + + new_path = fil_make_ibbackup_old_name(filepath); + + mutex_exit(&(fil_system->mutex)); + + ut_a(os_file_rename(filepath, new_path)); + + ut_free(buf2); + mem_free(filepath); + mem_free(new_path); + + return; + } + mutex_exit(&(fil_system->mutex)); +#endif + success = fil_space_create(filepath, space_id, FIL_TABLESPACE); + + if (!success) { + + goto func_exit; + } + + /* We do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(buf2); + mem_free(filepath); +} + +/************************************************************************ +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. 
*/

ulint
fil_load_single_table_tablespaces(void)
/*===================================*/
			/* out: DB_SUCCESS or error number */
{
	int		ret;
	char*		dbpath		= NULL;
	ulint		dbpath_len	= 100;
	os_file_dir_t	dir;
	os_file_dir_t	dbdir;
	os_file_stat_t	dbinfo;
	os_file_stat_t	fileinfo;

	/* The datadir of MySQL is always the default directory of mysqld */

	dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE);

	if (dir == NULL) {

		return(DB_ERROR);
	}

	dbpath = mem_alloc(dbpath_len);

	/* Scan all directories under the datadir. They are the database
	directories of MySQL. */

	ret = os_file_readdir_next_file(fil_path_to_mysql_datadir, dir,
								&dbinfo);
	while (ret == 0) {
		ulint len;

		if (dbinfo.type == OS_FILE_TYPE_FILE
		    || dbinfo.type == OS_FILE_TYPE_UNKNOWN) {

			goto next_datadir_item;
		}

		/* We found a symlink or a directory; try opening it to see
		if a symlink is a directory */

		/* The 2 extra bytes are for the '/' separator and the
		terminating NUL; the path buffer is regrown only when the
		current one is too small */
		len = strlen(fil_path_to_mysql_datadir)
				+ strlen (dbinfo.name) + 2;
		if (len > dbpath_len) {
			dbpath_len = len;

			if (dbpath) {
				mem_free(dbpath);
			}

			dbpath = mem_alloc(dbpath_len);
		}
		sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir,
								dbinfo.name);
		srv_normalize_path_for_win(dbpath);

		dbdir = os_file_opendir(dbpath, FALSE);

		if (dbdir != NULL) {
			/* We found a database directory; loop through it,
			looking for possible .ibd files in it */

			ret = os_file_readdir_next_file(dbpath, dbdir,
								&fileinfo);
			while (ret == 0) {
				/* The test must look at the entry inside the
				database directory (fileinfo). Testing dbinfo
				here can never skip anything, because entries
				of unknown type in the datadir were already
				skipped above. */
				if (fileinfo.type == OS_FILE_TYPE_DIR
				    || fileinfo.type == OS_FILE_TYPE_UNKNOWN) {
					goto next_file_item;
				}

				/* We found a symlink or a file */
				if (strlen(fileinfo.name) > 4
				    && 0 == strcmp(fileinfo.name
						+ strlen(fileinfo.name) - 4,
						".ibd")) {
					/* The name ends in .ibd; try opening
					the file */
					fil_load_single_table_tablespace(
						dbinfo.name, fileinfo.name);
				}
next_file_item:
				ret = os_file_readdir_next_file(dbpath, dbdir,
								&fileinfo);
			}

			/* NOTE(review): the final ret of the inner scan is
			not examined; it is overwritten by the datadir read
			below, so a read error inside a database directory
			goes unreported. */

			if (0 != os_file_closedir(dbdir)) {
				fprintf(stderr,
"InnoDB: Warning: could not close database directory %s\n", dbpath);
			}
		}

next_datadir_item:
		ret = os_file_readdir_next_file(fil_path_to_mysql_datadir,
								dir, &dbinfo);
	}

	mem_free(dbpath);

	/* At the end of directory we should get 1 as the return value, -1
	if there was an error */
	if (ret != 1) {
		fprintf(stderr,
"InnoDB: Error: os_file_readdir_next_file returned %d in MySQL datadir\n",
							ret);
		os_file_closedir(dir);

		return(DB_ERROR);
	}

	if (0 != os_file_closedir(dir)) {
		fprintf(stderr,
"InnoDB: Error: could not close MySQL datadir\n");

		return(DB_ERROR);
	}

	return(DB_SUCCESS);
}

/************************************************************************
If we need crash recovery, and we have called
fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(),
we can call this function to print an error message of orphaned .ibd files
for which there is not a data dictionary entry with a matching table name
and space id.
*/ + +void +fil_print_orphaned_tablespaces(void) +/*================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space) { + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && !space->mark) { + fprintf(stderr, +"InnoDB: Warning: tablespace %s of id %lu has no matching table in\n" +"InnoDB: the InnoDB data dictionary.\n", space->name, (ulong) space->id); + } + + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. */ + +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + /* out: TRUE if does not exist or is being\ + deleted */ + ulint id, /* in: space id */ + ib_longlong version)/* in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +{ + fil_system_t* system = fil_system; fil_space_t* space; - fil_system_t* system = fil_system; - ulint size; ut_ad(system); @@ -724,29 +2953,36 @@ fil_space_get_size( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - size = space->size; - + if (space == NULL || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + if (version != ((ib_longlong)-1) + && space->tablespace_version != version) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + mutex_exit(&(system->mutex)); - return(size); + return(FALSE); } /*********************************************************************** -Checks if the pair space, page_no refers to an existing page in a -tablespace file space. */ +Returns TRUE if a single-table tablespace exists in the memory cache. 
*/ ibool -fil_check_adress_in_tablespace( -/*===========================*/ - /* out: TRUE if the address is meaningful */ - ulint id, /* in: space id */ - ulint page_no)/* in: page number */ +fil_tablespace_exists_in_mem( +/*=========================*/ + /* out: TRUE if exists */ + ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; - ulint size; - ibool ret; - + fil_space_t* space; + ut_ad(system); mutex_enter(&(system->mutex)); @@ -754,23 +2990,363 @@ fil_check_adress_in_tablespace( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); if (space == NULL) { - ret = FALSE; - } else { - size = space->size; + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. 
*/ + +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + /* out: TRUE if a matching tablespace + exists in the memory cache */ + ulint id, /* in: space id */ + const char* name, /* in: table name in the standard + 'databasename/tablename' format */ + ibool mark_space, /* in: in crash recovery, at database + startup we mark all spaces which have + an associated table in the InnoDB + data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist) + /* in: print detailed error + information to the .err log if a + matching tablespace is not found from + memory */ +{ + fil_system_t* system = fil_system; + fil_space_t* namespace; + fil_space_t* space; + char* path; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + path = fil_make_ibd_name(name); + + /* Look if there is a space with the same id */ + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + /* Look if there is a space with the same name; the name is the + directory path from the datadir to the file */ + + HASH_SEARCH(name_hash, system->name_hash, + ut_fold_string(path), namespace, + 0 == strcmp(namespace->name, path)); + if (space && space == namespace) { + /* Found */ + + if (mark_space) { + space->mark = TRUE; + } + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + if (!print_error_if_does_not_exist) { + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); + } - if (page_no > size) { - ret = FALSE; - } else if (space->purpose != FIL_TABLESPACE) { - ret = FALSE; + if (space == NULL) { + if (namespace == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s\n" +"InnoDB: in InnoDB data dictionary has tablespace id %lu,\n" +"InnoDB: but tablespace with that id or name does not exist. 
Have\n" +"InnoDB: you deleted or moved .ibd files?\n", + name, (ulong) id); } else { - ret = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s\n" +"InnoDB: in InnoDB data dictionary has tablespace id %lu,\n" +"InnoDB: but tablespace with that id does not exist. There is\n" +"InnoDB: a tablespace of name %s and id %lu, though. Have\n" +"InnoDB: you deleted or moved .ibd files?\n", + name, (ulong) id, namespace->name, + (ulong) namespace->id); } + fprintf(stderr, +"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n" +"InnoDB: how to resolve the issue.\n"); + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (0 != strcmp(space->name, path)) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s\n" +"InnoDB: in InnoDB data dictionary has tablespace id %lu,\n" +"InnoDB: but tablespace with that id has name %s.\n" +"InnoDB: Have you deleted or moved .ibd files?\n", name, (ulong) id, space->name); + + if (namespace != NULL) { + fprintf(stderr, +"InnoDB: There is a tablespace with the right name\n" +"InnoDB: %s, but its id is %lu.\n", namespace->name, (ulong) namespace->id); + } + + fprintf(stderr, +"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n" +"InnoDB: how to resolve the issue.\n"); + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. 
*/
static
ulint
fil_get_space_id_for_table(
/*=======================*/
				/* out: space id, ULINT_UNDEFINED if not
				found */
	const char*	name)	/* in: table name in the standard
				'databasename/tablename' format */
{
	fil_system_t*	system		= fil_system;
	fil_space_t*	found;
	char*		ibd_path;
	ulint		space_id;

	ut_ad(system);

	mutex_enter(&(system->mutex));

	ibd_path = fil_make_ibd_name(name);

	/* The name hash is keyed on the directory path to the file, so
	search with the path built from the table name */

	HASH_SEARCH(name_hash, system->name_hash,
					ut_fold_string(ibd_path), found,
					0 == strcmp(found->name, ibd_path));

	space_id = (found != NULL) ? found->id : ULINT_UNDEFINED;

	mem_free(ibd_path);

	mutex_exit(&(system->mutex));

	return(space_id);
}

/**************************************************************************
Tries to extend a data file so that it would accommodate the number of pages
given. The tablespace must be cached in the memory cache. If the space is big
enough already, does nothing.
*/ + +ibool +fil_extend_space_to_desired_size( +/*=============================*/ + /* out: TRUE if success */ + ulint* actual_size, /* out: size of the space after extension; + if we ran out of disk space this may be lower + than the desired size */ + ulint space_id, /* in: space id, must be != 0 */ + ulint size_after_extend)/* in: desired size in pages after the + extension; if the current space size is bigger + than this already, the function does nothing */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ulint start_page_no; + ulint file_start_page_no; + ulint n_pages; + ulint offset_high; + ulint offset_low; + ibool success = TRUE; + + fil_mutex_enter_and_prepare_for_io(space_id); + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + ut_a(space); + + if (space->size >= size_after_extend) { + /* Space already big enough */ + + *actual_size = space->size; + + mutex_exit(&(system->mutex)); + + return(TRUE); } + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + /* Extend 1 MB at a time */ + + buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + memset(buf, '\0', 1024 * 1024); + + start_page_no = space->size; + file_start_page_no = space->size - node->size; + + while (start_page_no < size_after_extend) { + n_pages = size_after_extend - start_page_no; + + if (n_pages > (1024 * 1024) / UNIV_PAGE_SIZE) { + n_pages = (1024 * 1024) / UNIV_PAGE_SIZE; + } + + offset_high = (start_page_no - file_start_page_no) + / (4096 * ((1024 * 1024) / UNIV_PAGE_SIZE)); + offset_low = ((start_page_no - file_start_page_no) + % (4096 * ((1024 * 1024) / UNIV_PAGE_SIZE))) + * UNIV_PAGE_SIZE; +#ifdef UNIV_HOTBACKUP + success = os_file_write(node->name, node->handle, buf, + offset_low, offset_high, + UNIV_PAGE_SIZE * n_pages); +#else + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + 
offset_low, offset_high, + UNIV_PAGE_SIZE * n_pages, + NULL, NULL); +#endif + if (success) { + node->size += n_pages; + space->size += n_pages; + + os_has_said_disk_full = FALSE; + } else { + /* Let us measure the size of the file to determine + how much we were able to extend it */ + + n_pages = ((ulint) + (os_file_get_size_as_iblonglong(node->handle) + / UNIV_PAGE_SIZE)) - node->size; + + node->size += n_pages; + space->size += n_pages; + + break; + } + + start_page_no += n_pages; + } + + mem_free(buf2); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + *actual_size = space->size; + /* + printf("Extended %s to %lu, actual size %lu pages\n", space->name, + size_after_extend, *actual_size); */ + mutex_exit(&(system->mutex)); + + fil_flush(space_id); + + return(success); +} + +#ifdef UNIV_HOTBACKUP +/************************************************************************ +Extends all tablespaces to the size stored in the space header. During the +ibbackup --apply-log phase we extended the spaces on-demand so that log records +could be applied, but that may have left spaces still too small compared to +the size stored in the space header. 
*/ + +void +fil_extend_tablespaces_to_stored_len(void) +/*======================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + byte* buf; + ulint actual_size; + ulint size_in_header; + ulint error; + ibool success; + + buf = mem_alloc(UNIV_PAGE_SIZE); + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space) { + ut_a(space->purpose == FIL_TABLESPACE); + + mutex_exit(&(system->mutex)); /* no need to protect with a + mutex, because this is a single- + threaded operation */ + error = fil_read(TRUE, space->id, 0, 0, UNIV_PAGE_SIZE, buf, + NULL); + ut_a(error == DB_SUCCESS); + + size_in_header = fsp_get_size_low(buf); + + success = fil_extend_space_to_desired_size(&actual_size, + space->id, size_in_header); + if (!success) { + fprintf(stderr, +"InnoDB: Error: could not extend the tablespace of %s\n" +"InnoDB: to the size stored in header, %lu pages;\n" +"InnoDB: size after extension %lu pages\n" +"InnoDB: Check that you have free disk space and retry!\n", space->name, + size_in_header, actual_size); + exit(1); + } + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_NEXT(space_list, space); + } + mutex_exit(&(system->mutex)); - return(ret); + mem_free(buf); } +#endif + +/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ /*********************************************************************** Tries to reserve free extents in a file space. 
*/ @@ -783,8 +3359,8 @@ fil_space_reserve_free_extents( ulint n_free_now, /* in: number of free extents now */ ulint n_to_reserve) /* in: how many one wants to reserve */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ibool success; ut_ad(system); @@ -793,6 +3369,8 @@ fil_space_reserve_free_extents( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + if (space->n_reserved_extents + n_to_reserve > n_free_now) { success = FALSE; } else { @@ -814,8 +3392,8 @@ fil_space_release_free_extents( ulint id, /* in: space id */ ulint n_reserved) /* in: how many one reserved */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -823,6 +3401,7 @@ fil_space_release_free_extents( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); ut_a(space->n_reserved_extents >= n_reserved); space->n_reserved_extents -= n_reserved; @@ -839,8 +3418,8 @@ fil_space_get_n_reserved_extents( /*=============================*/ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ulint n; ut_ad(system); @@ -858,208 +3437,99 @@ fil_space_get_n_reserved_extents( return(n); } +/*============================ FILE I/O ================================*/ + /************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + Prepares a file node for i/o. Opens the file if it is closed. Updates the pending i/o's field in the node and the system appropriately. Takes the node -off the LRU list if it is in the LRU list. */ +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. 
*/ static void fil_node_prepare_for_io( /*====================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system, /* in: file system */ + fil_system_t* system, /* in: tablespace memory cache */ fil_space_t* space) /* in: space */ { - ibool ret; - fil_node_t* last_node; - ut_ad(node && system && space); #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(system->mutex))); #endif /* UNIV_SYNC_DEBUG */ + if (system->n_open > system->max_n_open + 5) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: open files %lu exceeds the limit %lu\n", + (ulong) system->n_open, + (ulong) system->max_n_open); + } + if (node->open == FALSE) { - /* File is closed */ + /* File is closed: open it */ ut_a(node->n_pending == 0); - /* If too many files are open, close one */ - - if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU) - == system->max_n_open) { - - ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - - last_node = UT_LIST_GET_LAST(system->LRU); - - if (last_node == NULL) { - fprintf(stderr, - "InnoDB: Error: cannot close any file to open another for i/o\n" - "InnoDB: Pending i/o's on %lu files exist\n", - system->n_open_pending); - - ut_error; - } - - fil_node_close(last_node, system); - } - - if (space->purpose == FIL_LOG) { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_LOG_FILE, &ret); - } else { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); - } - - ut_a(ret); - - node->open = TRUE; - - system->n_open_pending++; - node->n_pending = 1; - - /* File was closed: the node was not in the LRU list */ - - return; + fil_node_open_file(node, system, space); } - /* File is open */ - if (node->n_pending == 0) { + if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE + && space->id != 0) { /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(LRU, system->LRU, node); - - system->n_open_pending++; - node->n_pending = 1; - } else { - /* There is already a pending i/o-op on the file: 
the node is - not in the LRU list */ + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - node->n_pending++; + UT_LIST_REMOVE(LRU, system->LRU, node); } + + node->n_pending++; } /************************************************************************ Updates the data structures when an i/o operation finishes. Updates the -pending i/os field in the node and the system appropriately. Puts the node -in the LRU list if there are no other pending i/os. */ +pending i/o's field in the node appropriately. */ static void fil_node_complete_io( /*=================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system, /* in: file system */ - ulint type) /* in: OS_FILE_WRITE or ..._READ */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type) /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ { ut_ad(node); ut_ad(system); #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(system->mutex))); #endif /* UNIV_SYNC_DEBUG */ + ut_a(node->n_pending > 0); node->n_pending--; - if (type != OS_FILE_READ) { - node->is_modified = TRUE; + if (type == OS_FILE_WRITE) { + system->modification_counter++; + node->modification_counter = system->modification_counter; } - if (node->n_pending == 0) { + if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE + && node->space->id != 0) { /* The node must be put back to the LRU list */ UT_LIST_ADD_FIRST(LRU, system->LRU, node); - - ut_a(system->n_open_pending > 0); - - system->n_open_pending--; - - if (system->n_open_pending == system->max_n_open - 1) { - - os_event_set(system->can_open); - } } } - -/************************************************************************** -Tries to extend a data file by the number of pages given. Any fractions of a -megabyte are ignored. 
*/ - -ibool -fil_extend_last_data_file( -/*======================*/ - /* out: TRUE if success, also if we run - out of disk space we may return TRUE */ - ulint* actual_increase,/* out: number of pages we were able to - extend, here the orginal size of the file and - the resulting size of the file are rounded - downwards to a full megabyte, and the - difference expressed in pages is returned */ - ulint size_increase) /* in: try to extend this many pages */ -{ - fil_node_t* node; - fil_space_t* space; - fil_system_t* system = fil_system; - byte* buf2; - byte* buf; - ibool success; - ulint i; - - mutex_enter(&(system->mutex)); - - HASH_SEARCH(hash, system->spaces, 0, space, space->id == 0); - - ut_a(space); - - node = UT_LIST_GET_LAST(space->chain); - - fil_node_prepare_for_io(node, system, space); - - buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); - buf = ut_align(buf2, UNIV_PAGE_SIZE); - - memset(buf, '\0', 1024 * 1024); - - for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) { - - /* If we use native Windows aio, then also this write is - done using it */ - - success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, - node->name, node->handle, buf, - (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF, - node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), - 1024 * 1024, NULL, NULL); - - if (!success) { - break; - } - - node->size += ((1024 * 1024) / UNIV_PAGE_SIZE); - space->size += ((1024 * 1024) / UNIV_PAGE_SIZE); - - os_has_said_disk_full = FALSE; - } - - mem_free(buf2); - - fil_node_complete_io(node, system, OS_FILE_WRITE); - - mutex_exit(&(system->mutex)); - - *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE); - - fil_flush(0); - - srv_data_file_sizes[srv_n_data_files - 1] += *actual_increase; - - return(TRUE); -} /************************************************************************ Reads or writes data. This operation is asynchronous (aio). 
*/ -void +ulint fil_io( /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, ORed to OS_FILE_LOG, if a log i/o and ORed to OS_AIO_SIMULATED_WAKE_LATER @@ -1084,17 +3554,15 @@ fil_io( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { + fil_system_t* system = fil_system; ulint mode; fil_space_t* space; fil_node_t* node; ulint offset_high; ulint offset_low; - fil_system_t* system; - os_event_t event; ibool ret; ulint is_log; ulint wake_later; - ulint count; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -1105,7 +3573,7 @@ fil_io( ut_ad(byte_offset < UNIV_PAGE_SIZE); ut_ad(buf); ut_ad(len > 0); - ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE); + ut_a((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE); ut_ad(fil_validate()); #ifndef UNIV_LOG_DEBUG /* ibuf bitmap pages must be read in the sync aio mode: */ @@ -1127,82 +3595,47 @@ fil_io( mode = OS_AIO_NORMAL; } - system = fil_system; + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ - count = 0; -loop: - count++; - - /* NOTE that there is a possibility of a hang here: - if the read i/o-handler thread needs to complete - a read by reading from the insert buffer, it may need to - post another read. But if the maximum number of files - are already open, it cannot proceed from here! 
*/ - - mutex_enter(&(system->mutex)); + fil_mutex_enter_and_prepare_for_io(space_id); - if (count < 500 && !is_log && !ibuf_inside() - && system->n_open_pending >= (3 * system->max_n_open) / 4) { - - /* We are not doing an ibuf operation: leave a - safety margin of openable files for possible ibuf - merges needed in page read completion */ - - mutex_exit(&(system->mutex)); - - /* Wake the i/o-handler threads to make sure pending - i/o's are handled and eventually we can open the file */ - - os_aio_simulated_wake_handler_threads(); - - os_thread_sleep(100000); - - if (count > 50) { - fprintf(stderr, - "InnoDB: Warning: waiting for file closes to proceed\n" - "InnoDB: round %lu\n", count); - } - - goto loop; - } - - if (system->n_open_pending == system->max_n_open) { - - /* It is not sure we can open the file if it is closed: wait */ - - event = system->can_open; - os_event_reset(event); - + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (!space) { mutex_exit(&(system->mutex)); - /* Wake the i/o-handler threads to make sure pending - i/o's are handled and eventually we can open the file */ - - os_aio_simulated_wake_handler_threads(); - + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Warning: max allowed number of files is open\n"); - - os_event_wait(event); - - goto loop; - } +" InnoDB: Error: trying to do i/o to a tablespace which does not exist.\n" +"InnoDB: i/o type %lu, space id %lu, page no. 
%lu, i/o length %lu bytes\n", + (ulong) type, (ulong) space_id, (ulong) block_offset, + (ulong) len); - HASH_SEARCH(hash, system->spaces, space_id, space, - space->id == space_id); - ut_a(space); + return(DB_TABLESPACE_DELETED); + } ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE)); node = UT_LIST_GET_FIRST(space->chain); for (;;) { + if (space->id != 0 && node->size == 0) { + /* We do not know the size of a single-table tablespace + before we open the file */ + + break; + } + if (node == NULL) { fprintf(stderr, - "InnoDB: Error: trying to access page number %lu in space %lu\n" + "InnoDB: Error: trying to access page number %lu in space %lu,\n" + "InnoDB: space name %s,\n" "InnoDB: which is outside the tablespace bounds.\n" "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n", - block_offset, space_id, byte_offset, len, type); + (ulong) block_offset, (ulong) space_id, + space->name, (ulong) byte_offset, (ulong) len, + (ulong) type); ut_error; } @@ -1219,13 +3652,29 @@ loop: /* Open file if closed */ fil_node_prepare_for_io(node, system, space); + /* Check that at least the start offset is within the bounds of a + single-table tablespace */ + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && node->size <= block_offset) { + + fprintf(stderr, + "InnoDB: Error: trying to access page number %lu in space %lu,\n" + "InnoDB: space name %s,\n" + "InnoDB: which is outside the tablespace bounds.\n" + "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n", + (ulong) block_offset, (ulong) space_id, + space->name, (ulong) byte_offset, (ulong) len, + (ulong) type); + ut_a(0); + } + /* Now we have made the changes in the data structures of system */ mutex_exit(&(system->mutex)); /* Calculate the low 32 bits and the high 32 bits of the file offset */ offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT)); - offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF) + offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL) + 
byte_offset; ut_a(node->size - block_offset >= @@ -1236,9 +3685,20 @@ loop: ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); +#ifdef UNIV_HOTBACKUP + /* In ibbackup do normal i/o, not aio */ + if (type == OS_FILE_READ) { + ret = os_file_read(node->handle, buf, offset_low, offset_high, + len); + } else { + ret = os_file_write(node->name, node->handle, buf, + offset_low, offset_high, len); + } +#else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, offset_low, offset_high, len, node, message); +#endif ut_a(ret); if (mode == OS_AIO_SYNC) { @@ -1253,6 +3713,8 @@ loop: ut_ad(fil_validate()); } + + return(DB_SUCCESS); } /************************************************************************ @@ -1260,9 +3722,12 @@ Reads data from a space to a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. */ -void +ulint fil_read( /*=====*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -1276,8 +3741,8 @@ fil_read( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { - fil_io(OS_FILE_READ, sync, space_id, block_offset, byte_offset, len, - buf, message); + return(fil_io(OS_FILE_READ, sync, space_id, block_offset, + byte_offset, len, buf, message)); } /************************************************************************ @@ -1285,9 +3750,12 @@ Writes data to a space from a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. 
*/ -void +ulint fil_write( /*======*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -1301,8 +3769,8 @@ fil_write( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { - fil_io(OS_FILE_WRITE, sync, space_id, block_offset, byte_offset, len, - buf, message); + return(fil_io(OS_FILE_WRITE, sync, space_id, block_offset, + byte_offset, len, buf, message)); } /************************************************************************** @@ -1317,9 +3785,9 @@ fil_aio_wait( ulint segment) /* in: the number of the segment in the aio array to wait for */ { + fil_system_t* system = fil_system; ibool ret; fil_node_t* fil_node; - fil_system_t* system = fil_system; void* message; ulint type; @@ -1328,8 +3796,8 @@ fil_aio_wait( if (os_aio_use_native_aio) { srv_set_io_thread_op_info(segment, "native aio handle"); #ifdef WIN_ASYNC_IO - ret = os_aio_windows_handle(segment, 0, &fil_node, &message, - &type); + ret = os_aio_windows_handle(segment, 0, (void**) &fil_node, + &message, &type); #elif defined(POSIX_ASYNC_IO) ret = os_aio_posix_handle(segment, &fil_node, &message); #else @@ -1356,6 +3824,10 @@ fil_aio_wait( ut_ad(fil_validate()); /* Do the i/o handling */ + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in tablespace 0, you have to be very careful not to introduce + deadlocks in the i/o system. We keep tablespace 0 data files always + open, and use a special i/o thread to serve insert buffer requests. */ if (buf_pool_is_block(message)) { srv_set_io_thread_op_info(segment, "complete io for buf page"); @@ -1367,7 +3839,8 @@ fil_aio_wait( } /************************************************************************** -Flushes to disk possible writes cached by the OS. 
*/ +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ void fil_flush( @@ -1379,42 +3852,81 @@ fil_flush( fil_space_t* space; fil_node_t* node; os_file_t file; + ib_longlong old_mod_counter; mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, space_id, space, - space->id == space_id); - ut_a(space); + space->id == space_id); + if (!space || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return; + } + space->n_pending_flushes++; /* prevent dropping of the space while + we are flushing */ node = UT_LIST_GET_FIRST(space->chain); while (node) { - if (node->open && node->is_modified) { - file = node->handle; + if (node->modification_counter > node->flush_counter) { + ut_a(node->open); + + /* We want to flush the changes at least up to + old_mod_counter */ + old_mod_counter = node->modification_counter; - node->is_modified = FALSE; - if (space->purpose == FIL_TABLESPACE) { fil_n_pending_tablespace_flushes++; } else { fil_n_pending_log_flushes++; } +#ifdef __WIN__ + if (node->is_raw_disk) { - mutex_exit(&(system->mutex)); + goto skip_flush; + } +#endif +retry: + if (node->n_pending_flushes > 0) { + /* We want to avoid calling os_file_flush() on + the file twice at the same time, because we do + not know what bugs OS's may contain in file + i/o; sleep for a while */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + mutex_enter(&(system->mutex)); + + if (node->flush_counter >= old_mod_counter) { + + goto skip_flush; + } - /* Note that it is not certain, when we have - released the mutex above, that the file of the - handle is still open: we assume that the OS - will not crash or trap even if we pass a handle - to a closed file below in os_file_flush! 
*/ + goto retry; + } + + ut_a(node->open); + file = node->handle; + node->n_pending_flushes++; + + mutex_exit(&(system->mutex)); /* fprintf(stderr, "Flushing to file %s\n", node->name); */ - os_file_flush(file); - + os_file_flush(file); + mutex_enter(&(system->mutex)); + node->n_pending_flushes--; +skip_flush: + if (node->flush_counter < old_mod_counter) { + node->flush_counter = old_mod_counter; + } + if (space->purpose == FIL_TABLESPACE) { fil_n_pending_tablespace_flushes--; } else { @@ -1425,11 +3937,13 @@ fil_flush( node = UT_LIST_GET_NEXT(chain, node); } + space->n_pending_flushes--; + mutex_exit(&(system->mutex)); } /************************************************************************** -Flushes to disk writes in file spaces of the given type possibly cached by +Flushes to disk the writes in file spaces of the given type possibly cached by the OS. */ void @@ -1446,13 +3960,17 @@ fil_flush_file_spaces( while (space) { if (space->purpose == purpose) { + space->n_pending_flushes++; /* prevent dropping of the + space while we are + flushing */ mutex_exit(&(system->mutex)); fil_flush(space->id); mutex_enter(&(system->mutex)); - } + space->n_pending_flushes--; + } space = UT_LIST_GET_NEXT(space_list, space); } @@ -1460,20 +3978,18 @@ fil_flush_file_spaces( } /********************************************************************** -Checks the consistency of the file system. */ +Checks the consistency of the tablespace cache. 
*/ ibool fil_validate(void) /*==============*/ /* out: TRUE if ok */ { + fil_system_t* system = fil_system; fil_space_t* space; fil_node_t* fil_node; - ulint pending_count = 0; - fil_system_t* system; + ulint n_open = 0; ulint i; - - system = fil_system; mutex_enter(&(system->mutex)); @@ -1484,36 +4000,35 @@ fil_validate(void) space = HASH_GET_FIRST(system->spaces, i); while (space != NULL) { - UT_LIST_VALIDATE(chain, fil_node_t, space->chain); fil_node = UT_LIST_GET_FIRST(space->chain); while (fil_node != NULL) { - if (fil_node->n_pending > 0) { - - pending_count++; ut_a(fil_node->open); } + if (fil_node->open) { + n_open++; + } fil_node = UT_LIST_GET_NEXT(chain, fil_node); } - space = HASH_GET_NEXT(hash, space); } } - ut_a(pending_count == system->n_open_pending); + ut_a(system->n_open == n_open); UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU); fil_node = UT_LIST_GET_FIRST(system->LRU); while (fil_node != NULL) { - ut_a(fil_node->n_pending == 0); ut_a(fil_node->open); + ut_a(fil_node->space->purpose == FIL_TABLESPACE); + ut_a(fil_node->space->id != 0); fil_node = UT_LIST_GET_NEXT(LRU, fil_node); } @@ -1581,4 +4096,4 @@ fil_page_get_type( ut_ad(page); return(mach_read_from_2(page + FIL_PAGE_TYPE)); -} +} diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index 53f5e885df8..34b6de76ff4 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -27,6 +27,10 @@ Created 11/29/1995 Heikki Tuuri #include "dict0mem.h" #include "log0log.h" + +#define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header + within a file page */ + /* The data structures in files are defined just as byte strings in C */ typedef byte fsp_header_t; typedef byte xdes_t; @@ -38,10 +42,9 @@ File space header data structure: this data structure is contained in the first page of a space. The space for this header is reserved in every extent descriptor page, but used only in the first. 
*/ -#define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header - within a file page */ /*-------------------------------------*/ -#define FSP_NOT_USED 0 /* this field contained a value up to +#define FSP_SPACE_ID 0 /* space id */ +#define FSP_NOT_USED 4 /* this field contained a value up to which we know that the modifications in the database have been flushed to the file space; not used now */ @@ -50,7 +53,13 @@ descriptor page, but used only in the first. */ #define FSP_FREE_LIMIT 12 /* Minimum page number for which the free list has not been initialized: the pages >= this limit are, by - definition, free */ + definition, free; note that in a + single-table tablespace where size + < 64 pages, this number is 64, i.e., + we have initialized the space + about the first extent, but have not + physically allocted those pages to the + file */ #define FSP_LOWEST_NO_WRITE 16 /* The lowest page offset for which the page has not been written to disk (if it has been written, we know that @@ -83,7 +92,6 @@ descriptor page, but used only in the first. */ #define FSP_FREE_ADD 4 /* this many free extents are added to the free list from above FSP_FREE_LIMIT at a time */ - /* FILE SEGMENT INODE ================== @@ -263,9 +271,14 @@ static void fsp_fill_free_list( /*===============*/ - ulint space, /* in: space */ - fsp_header_t* header, /* in: space header */ - mtr_t* mtr); /* in: mtr */ + ibool init_space, /* in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr); /* in: mtr */ /************************************************************************** Allocates a single free page from a segment. 
This function implements the intelligent allocation strategy which tries to minimize file space @@ -286,6 +299,19 @@ fseg_alloc_free_page_low( FSP_UP, FSP_NO_DIR */ mtr_t* mtr); /* in: mtr handle */ + +/************************************************************************** +Reads the file space size stored in the header page. */ + +ulint +fsp_get_size_low( +/*=============*/ + /* out: tablespace size stored in the space header */ + page_t* page) /* in: header page (page 0 in the tablespace) */ +{ + return(mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE)); +} + /************************************************************************** Gets a pointer to the space header and x-locks its page. */ UNIV_INLINE @@ -569,7 +595,7 @@ xdes_init( ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0); for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) { - mlog_write_ulint(descr + i, 0xFFFFFFFF, MLOG_4BYTES, mtr); + mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr); } xdes_set_state(descr, XDES_FREE, mtr); @@ -630,8 +656,8 @@ xdes_get_descriptor_with_space_hdr( page_t* descr_page; ut_ad(mtr); - ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK)); - + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), + MTR_MEMO_X_LOCK)); /* Read free limit and space size */ limit = mtr_read_ulint(sp_header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); size = mtr_read_ulint(sp_header + FSP_SIZE, MLOG_4BYTES, mtr); @@ -646,7 +672,7 @@ xdes_get_descriptor_with_space_hdr( /* If offset is == limit, fill free list of the space. 
*/ if (offset == limit) { - fsp_fill_free_list(space, sp_header, mtr); + fsp_fill_free_list(FALSE, space, sp_header, mtr); } descr_page_no = xdes_calc_descriptor_page(offset); @@ -714,8 +740,8 @@ xdes_lst_get_descriptor( xdes_t* descr; ut_ad(mtr); - ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK)); - + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), + MTR_MEMO_X_LOCK)); descr = fut_get_ptr(space, lst_node, RW_X_LATCH, mtr) - XDES_FLST_NODE; return(descr); @@ -825,8 +851,21 @@ fsp_init(void) } /************************************************************************** +Writes the space id to a tablespace header. This function is used past the +buffer pool when we in fil0fil.c create a new single-table tablespace. */ + +void +fsp_header_write_space_id( +/*======================*/ + page_t* page, /* in: first page in the space */ + ulint space_id) /* in: space id */ +{ + mach_write_to_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID, space_id); +} + +/************************************************************************** Initializes the space header of a new created space and creates also the -insert buffer tree root. */ +insert buffer tree root if space == 0. 
*/ void fsp_header_init( @@ -843,9 +882,6 @@ fsp_header_init( mtr_x_lock(fil_space_get_latch(space), mtr); page = buf_page_create(space, 0, mtr); -#ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level(page, SYNC_FSP_PAGE); -#endif /* UNIV_SYNC_DEBUG */ buf_page_get(space, 0, RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_FSP_PAGE); @@ -857,6 +893,8 @@ fsp_header_init( header = FSP_HEADER_OFFSET + page; + mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr); mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr); mlog_write_ulint(header + FSP_LOWEST_NO_WRITE, 0, MLOG_4BYTES, mtr); @@ -869,10 +907,40 @@ fsp_header_init( flst_init(header + FSP_SEG_INODES_FREE, mtr); mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), mtr); - fsp_fill_free_list(space, header, mtr); - - btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, + if (space == 0) { + fsp_fill_free_list(FALSE, space, header, mtr); + btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, ut_dulint_add(DICT_IBUF_ID_MIN, space), mtr); + } else { + fsp_fill_free_list(TRUE, space, header, mtr); + } +} + +/************************************************************************** +Reads the space id from the first page of a tablespace. 
*/ + +ulint +fsp_header_get_space_id( +/*====================*/ + /* out: space id, ULINT UNDEFINED if error */ + page_t* page) /* in: first page of a tablespace */ +{ + ulint fsp_id; + ulint id; + + fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID); + + id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + if (id != fsp_id) { + fprintf(stderr, +"InnoDB: Error: space id in fsp header %lu, but in the page header %lu\n", + (ulong) fsp_id, + (ulong) id); + return(ULINT_UNDEFINED); + } + + return(id); } /************************************************************************** @@ -896,7 +964,8 @@ fsp_header_inc_size( size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, + mtr); } /************************************************************************** @@ -909,7 +978,7 @@ ulint fsp_header_get_free_limit( /*======================*/ /* out: free limit in megabytes */ - ulint space) /* in: space id */ + ulint space) /* in: space id, must be 0 */ { fsp_header_t* header; ulint limit; @@ -943,7 +1012,7 @@ ulint fsp_header_get_tablespace_size( /*===========================*/ /* out: size in pages */ - ulint space) /* in: space id */ + ulint space) /* in: space id, must be 0 */ { fsp_header_t* header; ulint size; @@ -965,40 +1034,80 @@ fsp_header_get_tablespace_size( } /*************************************************************************** -Tries to extend the last data file file if it is defined as auto-extending. */ +Tries to extend a single-table tablespace so that a page would fit in the +data file. 
*/ static ibool -fsp_try_extend_last_file( +fsp_try_extend_data_file_with_pages( +/*================================*/ + /* out: TRUE if success */ + ulint space, /* in: space */ + ulint page_no, /* in: page number */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ +{ + ibool success; + ulint actual_size; + ulint size; + + ut_a(space != 0); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + ut_a(page_no >= size); + + success = fil_extend_space_to_desired_size(&actual_size, space, + page_no + 1); + /* actual_size now has the space size in pages; it may be less than + we wanted if we ran out of disk space */ + + mlog_write_ulint(header + FSP_SIZE, actual_size, MLOG_4BYTES, mtr); + + return(success); +} + +/*************************************************************************** +Tries to extend the last data file of a tablespace if it is auto-extending. */ +static +ibool +fsp_try_extend_data_file( /*=====================*/ /* out: FALSE if not auto-extending */ - ulint* actual_increase,/* out: actual increase in pages */ + ulint* actual_increase,/* out: actual increase in pages, where + we measure the tablespace size from + what the header field says; it may be + the actual file size rounded down to + megabyte */ ulint space, /* in: space */ fsp_header_t* header, /* in: space header */ mtr_t* mtr) /* in: mtr */ { ulint size; + ulint new_size; + ulint old_size; ulint size_increase; + ulint actual_size; ibool success; - ut_a(space == 0); - *actual_increase = 0; - if (!srv_auto_extend_last_data_file) { + if (space == 0 && !srv_auto_extend_last_data_file) { return(FALSE); } size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); - if (srv_last_file_size_max != 0) { + old_size = size; + + if (space == 0 && srv_last_file_size_max != 0) { if (srv_last_file_size_max < srv_data_file_sizes[srv_n_data_files - 1]) { fprintf(stderr, "InnoDB: Error: Last data file size is %lu, max size allowed %lu\n", - 
srv_data_file_sizes[srv_n_data_files - 1], - srv_last_file_size_max); + (ulong) srv_data_file_sizes[srv_n_data_files - 1], + (ulong) srv_last_file_size_max); } size_increase = srv_last_file_size_max @@ -1007,24 +1116,58 @@ fsp_try_extend_last_file( size_increase = SRV_AUTO_EXTEND_INCREMENT; } } else { - size_increase = SRV_AUTO_EXTEND_INCREMENT; + if (space == 0) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } else { + /* We extend single-table tablespaces first one extent + at a time, but for bigger tablespaces more. It is not + enough to extend always by one extent, because some + extents are frag page extents. */ + + if (size < FSP_EXTENT_SIZE) { + /* Let us first extend the file to 64 pages */ + success = fsp_try_extend_data_file_with_pages( + space, FSP_EXTENT_SIZE - 1, + header, mtr); + if (!success) { + new_size = mtr_read_ulint( + header + FSP_SIZE, MLOG_4BYTES, mtr); + + *actual_increase = new_size - old_size; + + return(FALSE); + } + + size = FSP_EXTENT_SIZE; + } + + if (size < 32 * FSP_EXTENT_SIZE) { + size_increase = FSP_EXTENT_SIZE; + } else { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + size_increase = FSP_FREE_ADD * FSP_EXTENT_SIZE; + } + } } if (size_increase == 0) { + return(TRUE); } - /* Extend the data file. If we are not able to extend - the full requested length, the function tells us - the number of full megabytes (but the unit is pages!) - we were able to extend. 
*/ - - success = fil_extend_last_data_file(actual_increase, size_increase); + success = fil_extend_space_to_desired_size(&actual_size, space, + size + size_increase); + /* We ignore any fragments of a full megabyte when storing the size + to the space header */ - if (success) { - mlog_write_ulint(header + FSP_SIZE, size + *actual_increase, + mlog_write_ulint(header + FSP_SIZE, + ut_calc_align_down(actual_size, (1024 * 1024) / UNIV_PAGE_SIZE), MLOG_4BYTES, mtr); - } + new_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + *actual_increase = new_size - old_size; return(TRUE); } @@ -1037,9 +1180,14 @@ static void fsp_fill_free_list( /*===============*/ - ulint space, /* in: space */ - fsp_header_t* header, /* in: space header */ - mtr_t* mtr) /* in: mtr */ + ibool init_space, /* in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ { ulint limit; ulint size; @@ -1058,27 +1206,37 @@ fsp_fill_free_list( size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); - if (srv_auto_extend_last_data_file + if (space == 0 && srv_auto_extend_last_data_file && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { /* Try to increase the last data file size */ - fsp_try_extend_last_file(&actual_increase, space, header, - mtr); + fsp_try_extend_data_file(&actual_increase, space, header, mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + + if (space != 0 && !init_space + && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + + /* Try to increase the .ibd file size */ + fsp_try_extend_data_file(&actual_increase, space, header, mtr); size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); } i = limit; - while ((i + FSP_EXTENT_SIZE <= size) && (count < 
FSP_FREE_ADD)) { + while ((init_space && i < 1) + || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) { mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE, MLOG_4BYTES, mtr); /* Update the free limit info in the log system and make a checkpoint */ - log_fsp_current_free_limit_set_and_checkpoint( + if (space == 0) { + log_fsp_current_free_limit_set_and_checkpoint( (i + FSP_EXTENT_SIZE) / ((1024 * 1024) / UNIV_PAGE_SIZE)); + } if (0 == i % XDES_DESCRIBED_PER_PAGE) { @@ -1088,10 +1246,6 @@ fsp_fill_free_list( if (i > 0) { descr_page = buf_page_create(space, i, mtr); -#ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level(descr_page, - SYNC_FSP_PAGE); -#endif /* UNIV_SYNC_DEBUG */ buf_page_get(space, i, RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(descr_page, @@ -1100,7 +1254,7 @@ fsp_fill_free_list( fsp_init_file_page(descr_page, mtr); } - /* Initialize the ibuf page in a separate + /* Initialize the ibuf bitmap page in a separate mini-transaction because it is low in the latching order, and we must be able to release its latch before returning from the fsp routine */ @@ -1109,9 +1263,6 @@ fsp_fill_free_list( ibuf_page = buf_page_create(space, i + FSP_IBUF_BITMAP_OFFSET, &ibuf_mtr); -#ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level(ibuf_page, SYNC_IBUF_BITMAP); -#endif /* UNIV_SYNC_DEBUG */ buf_page_get(space, i + FSP_IBUF_BITMAP_OFFSET, RW_X_LATCH, &ibuf_mtr); #ifdef UNIV_SYNC_DEBUG @@ -1188,7 +1339,7 @@ fsp_alloc_free_extent( first = flst_get_first(header + FSP_FREE, mtr); if (fil_addr_is_null(first)) { - fsp_fill_free_list(space, header, mtr); + fsp_fill_free_list(FALSE, space, header, mtr); first = flst_get_first(header + FSP_FREE, mtr); } @@ -1225,6 +1376,8 @@ fsp_alloc_free_page( ulint free; ulint frag_n_used; ulint page_no; + ulint space_size; + ibool success; ut_ad(mtr); @@ -1278,6 +1431,30 @@ fsp_alloc_free_page( ut_error; } + page_no = xdes_get_offset(descr) + free; + + space_size = mtr_read_ulint(header + FSP_SIZE, 
MLOG_4BYTES, mtr); + + if (space_size <= page_no) { + /* It must be that we are extending a single-table tablespace + whose size is still < 64 pages */ + + ut_a(space != 0); + if (page_no >= FSP_EXTENT_SIZE) { + fprintf(stderr, +"InnoDB: Error: trying to extend a single-table tablespace %lu\n" +"InnoDB: by single page(s) though the space size %lu. Page no %lu.\n", + (ulong) space, (ulong) space_size, (ulong) page_no); + return(FIL_NULL); + } + success = fsp_try_extend_data_file_with_pages(space, page_no, + header, mtr); + if (!success) { + /* No disk space left */ + return(FIL_NULL); + } + } + xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); /* Update the FRAG_N_USED field */ @@ -1299,8 +1476,6 @@ fsp_alloc_free_page( mtr); } - page_no = xdes_get_offset(descr) + free; - /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ @@ -1346,7 +1521,8 @@ fsp_free_page( if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) { fprintf(stderr, "InnoDB: Error: File space extent descriptor of page %lu has state %lu\n", - page, state); + (ulong) page, + (ulong) state); fputs("InnoDB: Dump of descriptor: ", stderr); ut_print_buf(stderr, ((byte*)descr) - 50, 200); putc('\n', stderr); @@ -1364,7 +1540,7 @@ fsp_free_page( if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) { fprintf(stderr, "InnoDB: Error: File space extent descriptor of page %lu says it is free\n" -"InnoDB: Dump of descriptor: ", page); +"InnoDB: Dump of descriptor: ", (ulong) page); ut_print_buf(stderr, ((byte*)descr) - 50, 200); putc('\n', stderr); @@ -1599,8 +1775,8 @@ fsp_alloc_seg_inode( inode = fsp_seg_inode_page_get_nth_inode(page, n, mtr); - if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, mtr)) { - + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, + mtr)) { /* There are no other unused headers left on the page: move it to another list */ @@ -1654,7 +1830,7 @@ 
fsp_free_seg_inode( flst_remove(space_header + FSP_SEG_INODES_FREE, page + FSEG_INODE_PAGE_NODE, mtr); - fsp_free_page(space, buf_frame_get_page_no(page), mtr); + fsp_free_page(space, buf_frame_get_page_no(page), mtr); } } @@ -1818,12 +1994,12 @@ fseg_create_general( will belong to the created segment */ ulint byte_offset, /* in: byte offset of the created segment header on the page */ - ibool has_done_reservation, /* in: TRUE if the caller has - already done the reservation for the pages - with fsp_reserve_free_extents (at least 2 extents: - one for the inode and, then there other for the - segment) is no need to do the check for this - individual operation */ + ibool has_done_reservation, /* in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no need to do the check for this individual + operation */ mtr_t* mtr) /* in: mtr */ { fsp_header_t* space_header; @@ -1832,6 +2008,7 @@ fseg_create_general( fseg_header_t* header = 0; /* remove warning */ rw_lock_t* latch; ibool success; + ulint n_reserved; page_t* ret = NULL; ulint i; @@ -1855,12 +2032,14 @@ fseg_create_general( /* This thread did not own the latch before this call: free excess pages from the insert buffer free list */ - ibuf_free_excess_pages(space); + if (space == 0) { + ibuf_free_excess_pages(space); + } } if (!has_done_reservation) { - success = fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr); - + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); if (!success) { return(NULL); } @@ -1923,7 +2102,7 @@ fseg_create_general( funct_exit: if (!has_done_reservation) { - fil_space_release_free_extents(space, 2); + fil_space_release_free_extents(space, n_reserved); } return(ret); @@ -2141,6 +2320,8 @@ fseg_alloc_free_page_low( FSP_UP, FSP_NO_DIR */ mtr_t* mtr) /* in: mtr handle */ { + fsp_header_t* space_header; + ulint space_size; dulint 
seg_id; ulint used; ulint reserved; @@ -2151,6 +2332,7 @@ fseg_alloc_free_page_low( xdes_t* ret_descr; /* the extent of the allocated page */ page_t* page; ibool frag_page_allocated = FALSE; + ibool success; ulint n; ut_ad(mtr); @@ -2163,8 +2345,10 @@ fseg_alloc_free_page_low( reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr); - descr = xdes_get_descriptor(space, hint, mtr); + space_header = fsp_get_space_header(space, mtr); + descr = xdes_get_descriptor_with_space_hdr(space_header, space, + hint, mtr); if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ @@ -2294,8 +2478,32 @@ fseg_alloc_free_page_low( return(FIL_NULL); } - if (!frag_page_allocated) { + if (space != 0) { + space_size = fil_space_get_size(space); + + if (space_size <= ret_page) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + if (ret_page >= FSP_EXTENT_SIZE) { + fprintf(stderr, +"InnoDB: Error (2): trying to extend a single-table tablespace %lu\n" +"InnoDB: by single page(s) though the space size %lu. 
Page no %lu.\n", + (ulong) space, (ulong) space_size, + (ulong) ret_page); + return(FIL_NULL); + } + + success = fsp_try_extend_data_file_with_pages(space, + ret_page, space_header, mtr); + if (!success) { + /* No disk space left */ + return(FIL_NULL); + } + } + } + + if (!frag_page_allocated) { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ @@ -2356,6 +2564,7 @@ fseg_alloc_free_page_general( rw_lock_t* latch; ibool success; ulint page_no; + ulint n_reserved; space = buf_frame_get_space_id(seg_header); @@ -2372,14 +2581,16 @@ fseg_alloc_free_page_general( /* This thread did not own the latch before this call: free excess pages from the insert buffer free list */ - ibuf_free_excess_pages(space); + if (space == 0) { + ibuf_free_excess_pages(space); + } } inode = fseg_inode_get(seg_header, mtr); if (!has_done_reservation) { - success = fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr); - + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); if (!success) { return(FIL_NULL); } @@ -2388,7 +2599,7 @@ fseg_alloc_free_page_general( page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode), inode, hint, direction, mtr); if (!has_done_reservation) { - fil_space_release_free_extents(space, 2); + fil_space_release_free_extents(space, n_reserved); } return(page_no); @@ -2418,6 +2629,46 @@ fseg_alloc_free_page( } /************************************************************************** +Checks that we have at least 2 frag pages free in the first extent of a +single-table tablespace, and they are also physically initialized to the data +file. That is we have already extended the data file so that those pages are +inside the data file. If not, this function extends the tablespace with +pages. 
*/ +static +ibool +fsp_reserve_free_pages( +/*===================*/ + /* out: TRUE if there were >= 3 free + pages, or we were able to extend */ + ulint space, /* in: space id, must be != 0 */ + fsp_header_t* space_header, /* in: header of that space, + x-latched */ + ulint size, /* in: size of the tablespace in pages, + must be < FSP_EXTENT_SIZE / 2 */ + mtr_t* mtr) /* in: mtr */ +{ + xdes_t* descr; + ulint n_used; + + ut_a(space != 0); + ut_a(size < FSP_EXTENT_SIZE / 2); + + descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0, + mtr); + n_used = xdes_get_n_used(descr, mtr); + + ut_a(n_used <= size); + + if (size >= n_used + 2) { + + return(TRUE); + } + + return(fsp_try_extend_data_file_with_pages(space, n_used + 1, + space_header, mtr)); +} + +/************************************************************************** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand and reserve enough free extents so that they certainly will be able @@ -2435,12 +2686,21 @@ two types of allocation: when space is scarce, FSP_NORMAL allocations will not succeed, but the latter two allocations will succeed, if possible. The purpose is to avoid dead end where the database is full but the user cannot free any space because these freeing operations temporarily -reserve some space. */ +reserve some space. + +Single-table tablespaces whose size is < 32 pages are a special case. In this +function we would liberally reserve several 64 page extents for every page +split or merge in a B-tree. But we do not want to waste disk space if the table +only occupies < 32 pages. That is why we apply different rules in that special +case, just ensuring that there are 3 free pages available. 
*/ ibool fsp_reserve_free_extents( /*=====================*/ /* out: TRUE if we were able to make the reservation */ + ulint* n_reserved,/* out: number of extents actually reserved; if we + return TRUE and the tablespace size is < 64 pages, + then this can be 0, otherwise it is n_ext */ ulint space, /* in: space id */ ulint n_ext, /* in: number of extents to reserve */ ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */ @@ -2463,6 +2723,8 @@ fsp_reserve_free_extents( || mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK)); #endif /* UNIV_SYNC_DEBUG */ + *n_reserved = n_ext; + latch = fil_space_get_latch(space); mtr_x_lock(latch, mtr); @@ -2471,6 +2733,12 @@ fsp_reserve_free_extents( try_again: size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr); + if (size < FSP_EXTENT_SIZE / 2) { + /* Use different rules for small single-table tablespaces */ + *n_reserved = 0; + return(fsp_reserve_free_pages(space, space_header, size, mtr)); + } + n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr); free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT, @@ -2520,7 +2788,7 @@ try_again: return(TRUE); } try_to_extend: - success = fsp_try_extend_last_file(&n_pages_added, space, + success = fsp_try_extend_data_file(&n_pages_added, space, space_header, mtr); if (success && n_pages_added > 0) { @@ -2571,6 +2839,13 @@ fsp_get_available_space_in_free_extents( MLOG_4BYTES, &mtr); mtr_commit(&mtr); + if (size < FSP_EXTENT_SIZE) { + ut_a(space != 0); /* This must be a single-table + tablespace */ + return(0); /* TODO: count free frag pages and return + a value based on that */ + } + /* Below we play safe when counting free extents above the free limit: some of them will contain extent descriptor pages, and therefore will not be free extents */ @@ -2668,13 +2943,9 @@ fseg_free_page_low( xdes_t* descr; ulint not_full_n_used; ulint state; + dulint descr_id; + dulint seg_id; ulint i; - -#ifdef __WIN__ - dulint desm; - dulint segm; -#endif - 
ut_ad(seg_inode && mtr); ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == @@ -2698,7 +2969,7 @@ fseg_free_page_low( "InnoDB: though it is already marked as free in the tablespace!\n" "InnoDB: The tablespace free space info is corrupt.\n" "InnoDB: You may need to dump your InnoDB tables and recreate the whole\n" -"InnoDB: database!\n", page); +"InnoDB: database!\n", (ulong) page); crash: fputs( "InnoDB: If the InnoDB recovery crashes here, see section 6.1\n" @@ -2727,26 +2998,22 @@ fseg_free_page_low( return; } + /* If we get here, the page is in some extent of the segment */ + + descr_id = mtr_read_dulint(descr + XDES_ID, mtr); + seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr); /* fprintf(stderr, "InnoDB: InnoDB is freeing space %lu page %lu,\n" "InnoDB: which belongs to descr seg %lu %lu\n" "InnoDB: segment %lu %lu.\n", space, page, - ut_dulint_get_high( - mtr_read_dulint(descr + XDES_ID, mtr)), - ut_dulint_get_low( - mtr_read_dulint(descr + XDES_ID, mtr)), - ut_dulint_get_high( - mtr_read_dulint(seg_inode + FSEG_ID, mtr)), - ut_dulint_get_low( - mtr_read_dulint(seg_inode + FSEG_ID, mtr))); + ut_dulint_get_high(descr_id), + ut_dulint_get_low(descr_id), + ut_dulint_get_high(seg_id), + ut_dulint_get_low(seg_id)); */ - /* If we get here, the page is in some extent of the segment */ - if (0 != ut_dulint_cmp( - mtr_read_dulint(descr + XDES_ID, mtr), - mtr_read_dulint(seg_inode + FSEG_ID, mtr))) { - + if (0 != ut_dulint_cmp(descr_id, seg_id)) { fputs("InnoDB: Dump of the tablespace extent descriptor: ", stderr); ut_print_buf(stderr, descr, 40); @@ -2754,42 +3021,15 @@ fseg_free_page_low( ut_print_buf(stderr, seg_inode, 40); putc('\n', stderr); - -#ifndef __WIN__ - - fprintf(stderr, -"InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n" -"InnoDB: which does not belong to segment %lu %lu but belongs\n" -"InnoDB: to segment %lu %lu.\n", - space, page, - ut_dulint_get_high( - mtr_read_dulint(descr + XDES_ID, mtr)), - ut_dulint_get_low( - 
mtr_read_dulint(descr + XDES_ID, mtr)), - ut_dulint_get_high( - mtr_read_dulint(seg_inode + FSEG_ID, mtr)), - ut_dulint_get_low( - mtr_read_dulint(seg_inode + FSEG_ID, mtr))); - -#else - -/* More pedantic usage to avoid VC++ 6.0 compiler errors due to inline - function expansion issues */ - - desm = mtr_read_dulint(descr + XDES_ID, mtr); - segm = mtr_read_dulint(seg_inode + FSEG_ID, mtr); - - fprintf(stderr, + fprintf(stderr, "InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n" "InnoDB: which does not belong to segment %lu %lu but belongs\n" "InnoDB: to segment %lu %lu.\n", - space, page, - ut_dulint_get_high(desm), - ut_dulint_get_low(desm), - ut_dulint_get_high(segm), - ut_dulint_get_low(segm)); - -#endif + (ulong) space, (ulong) page, + (ulong) ut_dulint_get_high(descr_id), + (ulong) ut_dulint_get_low(descr_id), + (ulong) ut_dulint_get_high(seg_id), + (ulong) ut_dulint_get_low(seg_id)); goto crash; } @@ -3295,7 +3535,7 @@ fseg_print_low( seg_id_low = ut_dulint_get_low(d_var); seg_id_high = ut_dulint_get_high(d_var); - + n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr); n_frag = fseg_get_n_frag_pages(inode, mtr); @@ -3306,9 +3546,10 @@ fseg_print_low( fprintf(stderr, "SEGMENT id %lu %lu space %lu; page %lu; res %lu used %lu; full ext %lu\n" "fragm pages %lu; free extents %lu; not full extents %lu: pages %lu\n", - seg_id_high, seg_id_low, space, page_no, reserved, used, - n_full, - n_frag, n_free, n_not_full, n_used); + (ulong) seg_id_high, (ulong) seg_id_low, (ulong) space, (ulong) page_no, + (ulong) reserved, (ulong) used, (ulong) n_full, + (ulong) n_frag, (ulong) n_free, (ulong) n_not_full, + (ulong) n_used); } /*********************************************************************** @@ -3379,7 +3620,7 @@ fsp_validate( n_full_frag_pages = FSP_EXTENT_SIZE * flst_get_len(header + FSP_FULL_FRAG, &mtr); - ut_a(free_limit <= size); + ut_a(free_limit <= size || (space != 0 && size < FSP_EXTENT_SIZE)); flst_validate(header + 
FSP_FREE, &mtr); flst_validate(header + FSP_FREE_FRAG, &mtr); @@ -3616,10 +3857,10 @@ fsp_print( "size %lu, free limit %lu, free extents %lu\n" "not full frag extents %lu: used pages %lu, full frag extents %lu\n" "first seg id not used %lu %lu\n", - space, - size, free_limit, n_free, - n_free_frag, frag_n_used, n_full_frag, - seg_id_high, seg_id_low); + (long) space, + (ulong) size, (ulong) free_limit, (ulong) n_free, + (ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag, + (ulong) seg_id_high, (ulong) seg_id_low); mtr_commit(&mtr); @@ -3698,5 +3939,5 @@ fsp_print( mtr_commit(&mtr2); - fprintf(stderr, "NUMBER of file segments: %lu\n", n_segs); + fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) n_segs); } diff --git a/innobase/fut/fut0lst.c b/innobase/fut/fut0lst.c index ff112b586c4..8deaa8adb3f 100644 --- a/innobase/fut/fut0lst.c +++ b/innobase/fut/fut0lst.c @@ -512,6 +512,7 @@ flst_print( fprintf(stderr, "FILE-BASED LIST:\n" "Base node in space %lu page %lu byte offset %lu; len %lu\n", - buf_frame_get_space_id(frame), buf_frame_get_page_no(frame), - (ulint) (base - frame), len); + (ulong) buf_frame_get_space_id(frame), + (ulong) buf_frame_get_page_no(frame), + (ulong) (base - frame), (ulong) len); } diff --git a/innobase/ha/ha0ha.c b/innobase/ha/ha0ha.c index c7c4fd46dc8..ad1391ff83e 100644 --- a/innobase/ha/ha0ha.c +++ b/innobase/ha/ha0ha.c @@ -34,6 +34,12 @@ ha_create( table = hash_create(n); + if (in_btr_search) { + table->adaptive = TRUE; + } else { + table->adaptive = FALSE; + } + if (n_mutexes == 0) { if (in_btr_search) { table->heap = mem_heap_create_in_btr_search(4096); @@ -79,6 +85,7 @@ ha_insert_for_fold( hash_cell_t* cell; ha_node_t* node; ha_node_t* prev_node; + buf_block_t* prev_block; ulint hash; ut_ad(table && data); @@ -93,6 +100,12 @@ ha_insert_for_fold( while (prev_node != NULL) { if (prev_node->fold == fold) { + if (table->adaptive) { + prev_block = buf_block_align(prev_node->data); + ut_a(prev_block->n_pointers > 0); + 
prev_block->n_pointers--; + buf_block_align(data)->n_pointers++; + } prev_node->data = data; @@ -116,6 +129,11 @@ ha_insert_for_fold( } ha_node_set_data(node, data); + + if (table->adaptive) { + buf_block_align(data)->n_pointers++; + } + node->fold = fold; node->next = NULL; @@ -148,6 +166,11 @@ ha_delete_hash_node( hash_table_t* table, /* in: hash table */ ha_node_t* del_node) /* in: node to be deleted */ { + if (table->adaptive) { + ut_a(buf_block_align(del_node->data)->n_pointers > 0); + buf_block_align(del_node->data)->n_pointers--; + } + HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node); } @@ -174,6 +197,37 @@ ha_delete( ha_delete_hash_node(table, node); } +/************************************************************* +Looks for an element when we know the pointer to the data, and updates +the pointer to data, if found. */ + +void +ha_search_and_update_if_found( +/*==========================*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of the searched data */ + void* data, /* in: pointer to the data */ + void* new_data)/* in: new pointer to the data */ +{ + ha_node_t* node; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); +#endif /* UNIV_SYNC_DEBUG */ + + node = ha_search_with_data(table, fold, data); + + if (node) { + if (table->adaptive) { + ut_a(buf_block_align(node->data)->n_pointers > 0); + buf_block_align(node->data)->n_pointers--; + buf_block_align(new_data)->n_pointers++; + } + + node->data = new_data; + } +} + /********************************************************************* Removes from the chain determined by fold all nodes whose data pointer points to the page given. 
*/ @@ -208,7 +262,7 @@ ha_remove_all_nodes_to_page( node = ha_chain_get_next(node); } } - +#ifdef UNIV_DEBUG /* Check that all nodes really got deleted */ node = ha_chain_get_first(table, fold); @@ -218,6 +272,7 @@ ha_remove_all_nodes_to_page( node = ha_chain_get_next(node); } +#endif } /***************************************************************** @@ -246,7 +301,7 @@ ha_validate( fprintf(stderr, "InnoDB: Error: hash table node fold value %lu does not\n" "InnoDB: match with the cell number %lu.\n", - node->fold, i); + (ulong) node->fold, (ulong) i); ok = FALSE; } @@ -284,7 +339,7 @@ ha_print_info( fprintf(file, "Hash table size %lu, used cells %lu", - hash_get_n_cells(table), cells); + (ulong) hash_get_n_cells(table), (ulong) cells); if (table->heaps == NULL && table->heap != NULL) { @@ -297,6 +352,6 @@ ha_print_info( n_bufs++; } - fprintf(file, ", node heap has %lu buffer(s)\n", n_bufs); + fprintf(file, ", node heap has %lu buffer(s)\n", (ulong) n_bufs); } } diff --git a/innobase/ha/hash0hash.c b/innobase/ha/hash0hash.c index 1f7edf9d7d2..facdea66198 100644 --- a/innobase/ha/hash0hash.c +++ b/innobase/ha/hash0hash.c @@ -91,6 +91,7 @@ hash_create( array = ut_malloc(sizeof(hash_cell_t) * prime); + table->adaptive = FALSE; table->array = array; table->n_cells = prime; table->n_mutexes = 0; diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index 92ec6760c3a..f05e69863a3 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -29,6 +29,35 @@ Created 7/19/1997 Heikki Tuuri #include "log0recv.h" #include "que0que.h" +/* STRUCTURE OF AN INSERT BUFFER RECORD + +In versions < 4.1.x: + +1. The first field is the page number. +2. The second field is an array which stores type info for each subsequent + field. We store the information which affects the ordering of records, and + also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it + is 10 bytes. +3. Next we have the fields of the actual index record. 
+ +In versions >= 4.1.x: + +Note that contrary to what we planned in the 1990's, there will only be one +insert buffer tree, and that is in the system tablespace of InnoDB. + +1. The first field is the space id. +2. The second field is a one-byte marker which differentiates records from + the < 4.1.x storage format. +3. The third field is the page number. +4. The fourth field contains the type info, where we have also added 2 bytes to + store the charset. In the compressed table format of 5.0.x we must add more + information here so that we can build a dummy 'index' struct which 5.0.x + can use in the binary search on the index page in the ibuf merge phase. +5. The rest of the fields contain the fields of the actual index record. + +*/ + + /* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM If an OS thread performs any operation that brings in disk pages from @@ -45,20 +74,20 @@ because they own x-latches to pages which are on a lower level than the insert buffer tree latch, its page latches, and the tablespace latch an insert buffer operation can reserve. -The solution is the following: We put into each tablespace an insert buffer -of its own. Let all the tree and page latches connected with the insert buffer -be later in the latching order than the fsp latch and fsp page latches. +The solution is the following: Let all the tree and page latches connected +with the insert buffer be later in the latching order than the fsp latch and +fsp page latches. + Insert buffer pages must be such that the insert buffer is never invoked when these pages are accessed as this would result in a recursion violating the latching order. We let a special i/o-handler thread take care of i/o to the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap pages and the first inode page, which contains the inode of the ibuf tree: let -us call all these ibuf pages. 
If the OS does not support asynchronous i/o, -then there is no special i/o thread, but to prevent deadlocks, we do not let a -read-ahead access both non-ibuf and ibuf pages. +us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead +access both non-ibuf and ibuf pages. -Then an i/o-handler for the insert buffer never needs to access the insert -buffer tree and thus obeys the latching order. On the other hand, other +Then an i/o-handler for the insert buffer never needs to access recursively the +insert buffer tree and thus obeys the latching order. On the other hand, other i/o-handlers for other tablespaces may require access to the insert buffer, but because all kinds of latches they need to access there are later in the latching order, no violation of the latching order occurs in this case, @@ -95,8 +124,8 @@ the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e., -it uses synchronous aio or the OS does not support aio, it can access any -pages, as long as it obeys the access order rules. */ +it uses synchronous aio, it can access any pages, as long as it obeys the +access order rules. 
*/ /* Buffer pool size per the maximum insert buffer size */ #define IBUF_POOL_SIZE_PER_MAX_SIZE 2 @@ -110,8 +139,8 @@ ulint ibuf_rnd = 986058871; ulint ibuf_flush_count = 0; /* Dimensions for the ibuf_count array */ -#define IBUF_COUNT_N_SPACES 10 -#define IBUF_COUNT_N_PAGES 10000 +#define IBUF_COUNT_N_SPACES 500 +#define IBUF_COUNT_N_PAGES 2000 /* Buffered entry counts for file pages, used in debugging */ static ulint* ibuf_counts[IBUF_COUNT_N_SPACES]; @@ -239,6 +268,8 @@ ibuf_header_page_get( { page_t* page; + ut_a(space == 0); + ut_ad(!ibuf_inside()); page = buf_page_get(space, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr); @@ -263,6 +294,7 @@ ibuf_tree_root_get( { page_t* page; + ut_a(space == 0); ut_ad(ibuf_inside()); mtr_x_lock(dict_tree_get_lock((data->index)->tree), mtr); @@ -275,7 +307,7 @@ ibuf_tree_root_get( return(page); } - + /********************************************************************** Gets the ibuf count for a given page. */ @@ -309,17 +341,17 @@ ibuf_count_set( ulint page_no,/* in: page number */ ulint val) /* in: value to set */ { - ut_ad(space < IBUF_COUNT_N_SPACES); - ut_ad(page_no < IBUF_COUNT_N_PAGES); - ut_ad(val < UNIV_PAGE_SIZE); + ut_a(space < IBUF_COUNT_N_SPACES); + ut_a(page_no < IBUF_COUNT_N_PAGES); + ut_a(val < UNIV_PAGE_SIZE); *(ibuf_counts[space] + page_no) = val; } #endif /********************************************************************** -Creates the insert buffer data structure at a database startup and -initializes the data structures for the insert buffer of each tablespace. */ +Creates the insert buffer data structure at a database startup and initializes +the data structures for the insert buffer. */ void ibuf_init_at_db_start(void) @@ -411,19 +443,19 @@ ibuf_data_sizes_update( /* fprintf(stderr, "ibuf size %lu, space ibuf size %lu\n", ibuf->size, data->size); */ -} +} /********************************************************************** Creates the insert buffer data struct for a single tablespace. 
Reads the root page of the insert buffer tree in the tablespace. This function can be called only after the dictionary system has been initialized, as this -creates also the insert buffer table and index for this tablespace. */ +creates also the insert buffer table and index into this tablespace. */ ibuf_data_t* ibuf_data_init_for_space( /*=====================*/ /* out, own: ibuf data struct, linked to the list - in ibuf control structure. */ + in ibuf control structure */ ulint space) /* in: space id */ { ibuf_data_t* data; @@ -435,6 +467,8 @@ ibuf_data_init_for_space( dict_index_t* index; ulint n_used; + ut_a(space == 0); + #ifdef UNIV_LOG_DEBUG if (space % 2 == 1) { @@ -475,29 +509,37 @@ ibuf_data_init_for_space( data->n_merged_recs = 0; ibuf_data_sizes_update(data, root, &mtr); - +/* + if (!data->empty) { + fprintf(stderr, +"InnoDB: index entries found in the insert buffer\n"); + } else { + fprintf(stderr, +"InnoDB: insert buffer empty\n"); + } +*/ mutex_exit(&ibuf_mutex); mtr_commit(&mtr); ibuf_exit(); - sprintf(buf, "SYS_IBUF_TABLE_%lu", space); + sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space); table = dict_mem_table_create(buf, space, 2); - dict_mem_table_add_col(table,(char *) "PAGE_NO", DATA_BINARY, 0, 0, 0); - dict_mem_table_add_col(table,(char *) "TYPES", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "PAGE_NO", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "TYPES", DATA_BINARY, 0, 0, 0); table->id = ut_dulint_add(DICT_IBUF_ID_MIN, space); dict_table_add_to_cache(table); - index = dict_mem_index_create(buf, (char *) "CLUST_IND", space, + index = dict_mem_index_create(buf, "CLUST_IND", space, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,2); - dict_mem_index_add_field(index, (char *) "PAGE_NO", 0, 0); - dict_mem_index_add_field(index, (char *) "TYPES", 0, 0); + dict_mem_index_add_field(index, "PAGE_NO", 0, 0); + dict_mem_index_add_field(index, "TYPES", 0, 0); index->page_no = FSP_IBUF_TREE_ROOT_PAGE_NO; @@ -688,7 +730,7 @@ 
ibuf_bitmap_get_map_page( mtr_t* mtr) /* in: mtr */ { page_t* page; - + page = buf_page_get(space, ibuf_bitmap_page_no_calc(page_no), RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG @@ -902,7 +944,7 @@ UNIV_INLINE ibool ibuf_fixed_addr_page( /*=================*/ - /* out: TRUE if a fixed address ibuf i/o page */ + /* out: TRUE if a fixed address ibuf i/o page */ ulint page_no)/* in: page number */ { if ((ibuf_bitmap_page(page_no)) @@ -939,6 +981,12 @@ ibuf_page( return(TRUE); } + if (space != 0) { + /* Currently we only have an ibuf tree in space 0 */ + + return(FALSE); + } + ut_ad(fil_space_get_type(space) == FIL_TABLESPACE); mtr_start(&mtr); @@ -1003,14 +1051,60 @@ ibuf_rec_get_page_no( ut_ad(ibuf_inside()); ut_ad(rec_get_n_fields(rec) > 2); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field(rec, 1, &len); + + if (len == 1) { + /* This is of the >= 4.1.x record format */ + ut_a(trx_sys_multiple_tablespace_format); + + field = rec_get_nth_field(rec, 2, &len); + } else { + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + field = rec_get_nth_field(rec, 0, &len); + } - ut_ad(len == 4); + ut_a(len == 4); return(mach_read_from_4(field)); } /************************************************************************ +Returns the space id field of an ibuf record. For < 4.1.x format records +returns 0. 
*/ +static +ulint +ibuf_rec_get_space( +/*===============*/ + /* out: space id */ + rec_t* rec) /* in: ibuf record */ +{ + byte* field; + ulint len; + + ut_ad(ibuf_inside()); + ut_ad(rec_get_n_fields(rec) > 2); + + field = rec_get_nth_field(rec, 1, &len); + + if (len == 1) { + /* This is of the >= 4.1.x record format */ + + ut_a(trx_sys_multiple_tablespace_format); + field = rec_get_nth_field(rec, 0, &len); + ut_a(len == 4); + + return(mach_read_from_4(field)); + } + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + return(0); +} + +/************************************************************************ Returns the space taken by a stored non-clustered index entry if converted to an index record. */ static @@ -1022,6 +1116,7 @@ ibuf_rec_get_volume( rec_t* ibuf_rec)/* in: ibuf record */ { dtype_t dtype; + ibool new_format = FALSE; ulint data_size = 0; ulint n_fields; byte* types; @@ -1032,17 +1127,42 @@ ibuf_rec_get_volume( ut_ad(ibuf_inside()); ut_ad(rec_get_n_fields(ibuf_rec) > 2); - n_fields = rec_get_n_fields(ibuf_rec) - 2; + data = rec_get_nth_field(ibuf_rec, 1, &len); + + if (len > 1) { + /* < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); - types = rec_get_nth_field(ibuf_rec, 1, &len); + n_fields = rec_get_n_fields(ibuf_rec) - 2; - ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + types = rec_get_nth_field(ibuf_rec, 1, &len); + + ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } else { + /* >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + new_format = TRUE; + + n_fields = rec_get_n_fields(ibuf_rec) - 4; + + types = rec_get_nth_field(ibuf_rec, 3, &len); + } for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + if (new_format) { + data = rec_get_nth_field(ibuf_rec, i + 4, &len); - dtype_read_for_order_and_null_size(&dtype, + dtype_new_read_for_order_and_null_size(&dtype, + 
types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + } else { + data = rec_get_nth_field(ibuf_rec, i + 2, &len); + + dtype_read_for_order_and_null_size(&dtype, types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } if (len == UNIV_SQL_NULL) { data_size += dtype_get_sql_null_size(&dtype); @@ -1067,6 +1187,7 @@ ibuf_entry_build( must be kept because we copy pointers to its fields */ dtuple_t* entry, /* in: entry for a non-clustered index */ + ulint space, /* in: space id */ ulint page_no,/* in: index page number where entry should be inserted */ mem_heap_t* heap) /* in: heap into which to build */ @@ -1079,49 +1200,79 @@ ibuf_entry_build( byte* buf2; ulint i; - /* We have to build a tuple whose first field is the page number, - the second field contains the original type information for entry, - and the rest of the fields are copied from entry. All fields - in the tuple are of the type binary. */ + /* Starting from 4.1.x, we have to build a tuple whose + (1) first field is the space id, + (2) the second field a single marker byte to tell that this + is a new format record, + (3) the third contains the page number, and + (4) the fourth contains the relevant type information of each data + field, + (5) and the rest of the fields are copied from entry. All fields + in the tuple are ordered like the type binary in our insert buffer + tree. 
*/ n_fields = dtuple_get_n_fields(entry); - tuple = dtuple_create(heap, n_fields + 2); + tuple = dtuple_create(heap, n_fields + 4); - /* Store the page number in tuple */ + /* Store the space id in tuple */ field = dtuple_get_nth_field(tuple, 0); buf = mem_heap_alloc(heap, 4); - mach_write_to_4(buf, page_no); + mach_write_to_4(buf, space); dfield_set_data(field, buf, 4); - /* Store the type info in tuple */ + /* Store the marker byte field in tuple */ - buf2 = mem_heap_alloc(heap, n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + field = dtuple_get_nth_field(tuple, 1); - for (i = 0; i < n_fields; i++) { + buf = mem_heap_alloc(heap, 1); - field = dtuple_get_nth_field(tuple, i + 2); + /* We set the marker byte zero */ - entry_field = dtuple_get_nth_field(entry, i); + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + field = dtuple_get_nth_field(tuple, 2); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + /* Store the type info in buf2, and add the fields from entry to + tuple */ + buf2 = mem_heap_alloc(heap, n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + for (i = 0; i < n_fields; i++) { + /* We add 4 below because we have the 4 extra fields at the + start of an ibuf record */ + + field = dtuple_get_nth_field(tuple, i + 4); + entry_field = dtuple_get_nth_field(entry, i); dfield_copy(field, entry_field); - dtype_store_for_order_and_null_size( - buf2 + i * DATA_ORDER_NULL_TYPE_BUF_SIZE, + dtype_new_store_for_order_and_null_size( + buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, dfield_get_type(entry_field)); } - field = dtuple_get_nth_field(tuple, 1); + /* Store the type info in buf2 to field 3 of tuple */ - dfield_set_data(field, buf2, n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + field = dtuple_get_nth_field(tuple, 3); - /* Set the types in the new tuple binary */ + dfield_set_data(field, buf2, n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + /* Set all the 
types in the new tuple binary */ - dtuple_set_types_binary(tuple, n_fields + 2); + dtuple_set_types_binary(tuple, n_fields + 4); return(tuple); } @@ -1150,35 +1301,73 @@ ibuf_build_entry_from_ibuf_rec( ulint len; ulint i; - n_fields = rec_get_n_fields(ibuf_rec) - 2; + data = rec_get_nth_field(ibuf_rec, 1, &len); + + if (len > 1) { + /* This is a < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields(ibuf_rec) - 2; + tuple = dtuple_create(heap, n_fields); + types = rec_get_nth_field(ibuf_rec, 1, &len); + + ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field(ibuf_rec, i + 2, &len); + + dfield_set_data(field, data, len); + + dtype_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + return(tuple); + } + + /* This is a >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + + ut_a(rec_get_n_fields(ibuf_rec) > 4); + + n_fields = rec_get_n_fields(ibuf_rec) - 4; tuple = dtuple_create(heap, n_fields); - types = rec_get_nth_field(ibuf_rec, 1, &len); + types = rec_get_nth_field(ibuf_rec, 3, &len); - ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); + field = dtuple_get_nth_field(tuple, i); - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + data = rec_get_nth_field(ibuf_rec, i + 4, &len); dfield_set_data(field, data, len); - dtype_read_for_order_and_null_size(dfield_get_type(field), - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); } return(tuple); } /************************************************************************* -Builds a search tuple used 
to search buffered inserts for an index page. */ +Builds a search tuple used to search buffered inserts for an index page. +This is for < 4.1.x format records */ static dtuple_t* ibuf_search_tuple_build( /*====================*/ /* out, own: search tuple */ + ulint space, /* in: space id */ ulint page_no,/* in: index page number */ mem_heap_t* heap) /* in: heap into which to build */ { @@ -1186,6 +1375,10 @@ ibuf_search_tuple_build( dfield_t* field; byte* buf; + ut_a(space == 0); + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + tuple = dtuple_create(heap, 1); /* Store the page number in tuple */ @@ -1204,6 +1397,61 @@ ibuf_search_tuple_build( } /************************************************************************* +Builds a search tuple used to search buffered inserts for an index page. +This is for >= 4.1.x format records. */ +static +dtuple_t* +ibuf_new_search_tuple_build( +/*========================*/ + /* out, own: search tuple */ + ulint space, /* in: space id */ + ulint page_no,/* in: index page number */ + mem_heap_t* heap) /* in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + + ut_a(trx_sys_multiple_tablespace_format); + + tuple = dtuple_create(heap, 3); + + /* Store the space id in tuple */ + + field = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* Store the new format record marker byte */ + + field = dtuple_get_nth_field(tuple, 1); + + buf = mem_heap_alloc(heap, 1); + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, 2); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + dtuple_set_types_binary(tuple, 3); + + return(tuple); +} + +/************************************************************************* Checks if there are 
enough pages in the free list of the ibuf tree that we dare to start a pessimistic insert to the insert buffer. */ UNIV_INLINE @@ -1272,6 +1520,8 @@ ibuf_add_free_page( page_t* root; page_t* bitmap_page; + ut_a(space == 0); + mtr_start(&mtr); /* Acquire the fsp latch before the ibuf header, obeying the latching @@ -1317,7 +1567,7 @@ ibuf_add_free_page( page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST); - + ibuf_data->seg_size++; ibuf_data->free_list_len++; @@ -1328,7 +1578,6 @@ ibuf_add_free_page( ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF, TRUE, &mtr); - mtr_commit(&mtr); mutex_exit(&ibuf_mutex); @@ -1355,6 +1604,8 @@ ibuf_remove_free_page( page_t* root; page_t* bitmap_page; + ut_a(space == 0); + mtr_start(&mtr); /* Acquire the fsp latch before the ibuf header, obeying the latching @@ -1466,6 +1717,13 @@ ibuf_free_excess_pages( { ibuf_data_t* ibuf_data; ulint i; + + if (space != 0) { + fprintf(stderr, +"InnoDB: Error: calling ibuf_free_excess_pages for space %lu\n", (ulong) space); + return; + } + #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(fil_space_get_latch(space), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ @@ -1521,8 +1779,12 @@ ibuf_get_merge_page_nos( contract the tree, FALSE if this is called when a single page becomes full and we look if it pays to read also nearby pages */ - rec_t* first_rec,/* in: record from which we read down and - up in the chain of records */ + rec_t* first_rec,/* in: record from which we read up and down + in the chain of records */ + ulint* space_ids,/* in/out: space id's of the pages */ + ib_longlong* space_versions,/* in/out: tablespace version + timestamps; used to prevent reading in old + pages after DISCARD + IMPORT tablespace */ ulint* page_nos,/* in/out: buffer for at least IBUF_MAX_N_PAGES_MERGED many page numbers; the page numbers are in an ascending order */ @@ -1530,8 +1792,11 @@ ibuf_get_merge_page_nos( page_nos in this function */ { ulint 
prev_page_no; + ulint prev_space_id; ulint first_page_no; + ulint first_space_id; ulint rec_page_no; + ulint rec_space_id; rec_t* rec; ulint sum_volumes; ulint volume_for_page; @@ -1563,49 +1828,70 @@ ibuf_get_merge_page_nos( rec = first_rec; first_page_no = ibuf_rec_get_page_no(first_rec); + first_space_id = ibuf_rec_get_space(first_rec); n_pages = 0; prev_page_no = 0; + prev_space_id = 0; + /* Go backwards from the first_rec until we reach the border of the + 'merge area', or the page start or the limit of storeable pages is + reached */ + while ((rec != page_get_infimum_rec(page)) && (n_pages < limit)) { rec_page_no = ibuf_rec_get_page_no(rec); + rec_space_id = ibuf_rec_get_space(rec); - ut_ad(rec_page_no != 0); - - if (rec_page_no / IBUF_MERGE_AREA - != first_page_no / IBUF_MERGE_AREA) { + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { break; } - if (rec_page_no != prev_page_no) { + if (rec_page_no != prev_page_no + || rec_space_id != prev_space_id) { n_pages++; } prev_page_no = rec_page_no; + prev_space_id = rec_space_id; rec = page_rec_get_prev(rec); } rec = page_rec_get_next(rec); + /* At the loop start there is no prev page; we mark this with a pair + of space id, page no (0, 0) for which there can never be entries in + the insert buffer */ + prev_page_no = 0; + prev_space_id = 0; sum_volumes = 0; volume_for_page = 0; while (*n_stored < limit) { if (rec == page_get_supremum_rec(page)) { + /* When no more records available, mark this with + another 'impossible' pair of space id, page no */ rec_page_no = 1; + rec_space_id = 0; } else { rec_page_no = ibuf_rec_get_page_no(rec); + rec_space_id = ibuf_rec_get_space(rec); ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO); } #ifdef UNIV_IBUF_DEBUG ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); #endif - if (rec_page_no != prev_page_no) { - if ((prev_page_no == first_page_no) + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id 
!= 0 || prev_page_no != 0)) { + + if ((prev_page_no == first_page_no + && prev_space_id == first_space_id) || contract || (volume_for_page > ((IBUF_MERGE_THRESHOLD - 1) @@ -1613,6 +1899,10 @@ ibuf_get_merge_page_nos( / IBUF_PAGE_SIZE_PER_FREE_SPACE) / IBUF_MERGE_THRESHOLD)) { + space_ids[*n_stored] = prev_space_id; + space_versions[*n_stored] + = fil_space_get_version( + prev_space_id); page_nos[*n_stored] = prev_page_no; (*n_stored)++; @@ -1620,8 +1910,9 @@ ibuf_get_merge_page_nos( sum_volumes += volume_for_page; } - if (rec_page_no / IBUF_MERGE_AREA - != first_page_no / IBUF_MERGE_AREA) { + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { break; } @@ -1629,7 +1920,7 @@ ibuf_get_merge_page_nos( volume_for_page = 0; } - if (rec_page_no == 1) { + if (rec_page_no == 1 && rec_space_id == 0) { /* Supremum record */ break; @@ -1640,6 +1931,7 @@ ibuf_get_merge_page_nos( volume_for_page += rec_volume; prev_page_no = rec_page_no; + prev_space_id = rec_space_id; rec = page_rec_get_next(rec); } @@ -1672,6 +1964,8 @@ ibuf_contract_ext( ulint space; ibool all_trees_empty; ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED]; ulint n_stored; ulint sum_sizes; mtr_t mtr; @@ -1684,7 +1978,8 @@ loop: ut_ad(ibuf_validate_low()); - /* Choose an ibuf tree at random */ + /* Choose an ibuf tree at random (though there really is only one tree + in the current implementation) */ ibuf_rnd += 865558671; rnd_pos = ibuf_rnd % ibuf->size; @@ -1720,8 +2015,10 @@ loop: ut_ad(data); - space = (data->index)->space; + space = data->index->space; + ut_a(space == 0); /* We currently only have an ibuf tree in + space 0 */ mtr_start(&mtr); ibuf_enter(); @@ -1750,8 +2047,8 @@ loop: mutex_exit(&ibuf_mutex); sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), - page_nos, &n_stored); - + space_ids, space_versions, page_nos, + &n_stored); #ifdef 
UNIV_IBUF_DEBUG /* fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", sync, n_stored, sum_sizes); */ @@ -1761,8 +2058,8 @@ loop: mtr_commit(&mtr); btr_pcur_close(&pcur); - buf_read_ibuf_merge_pages(sync, space, page_nos, n_stored); - + buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos, + n_stored); *n_pages = n_stored; return(sum_sizes + 1); @@ -1891,6 +2188,8 @@ ibuf_get_volume_buffered( ulint next_page_no; page_t* next_page; + ut_a(trx_sys_multiple_tablespace_format); + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) || (pcur->latch_mode == BTR_MODIFY_TREE)); @@ -1913,7 +2212,8 @@ ibuf_get_volume_buffered( break; } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { goto count_later; } @@ -1932,7 +2232,7 @@ ibuf_get_volume_buffered( goto count_later; } - prev_page = buf_page_get(space, prev_page_no, RW_X_LATCH, mtr); + prev_page = buf_page_get(0, prev_page_no, RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(prev_page, SYNC_TREE_NODE); @@ -1951,7 +2251,8 @@ ibuf_get_volume_buffered( return(UNIV_PAGE_SIZE); } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { goto count_later; } @@ -1974,7 +2275,8 @@ count_later: break; } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { return(volume); } @@ -1993,7 +2295,7 @@ count_later: return(volume); } - next_page = buf_page_get(space, next_page_no, RW_X_LATCH, mtr); + next_page = buf_page_get(0, next_page_no, RW_X_LATCH, mtr); #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(next_page, SYNC_TREE_NODE); @@ -2010,7 +2312,8 @@ count_later: return(UNIV_PAGE_SIZE); } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { return(volume); } @@ -2022,6 +2325,57 @@ count_later: } 
/************************************************************************* +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ + +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + ulint max_space_id; + rec_t* rec; + byte* field; + ulint len; + ibuf_data_t* ibuf_data; + dict_index_t* ibuf_index; + btr_pcur_t pcur; + mtr_t mtr; + + ibuf_data = fil_space_get_ibuf_data(0); + + ibuf_index = ibuf_data->index; + + ibuf_enter(); + + mtr_start(&mtr); + + btr_pcur_open_at_index_side(FALSE, ibuf_index, BTR_SEARCH_LEAF, + &pcur, TRUE, &mtr); + btr_pcur_move_to_prev(&pcur, &mtr); + + if (btr_pcur_is_before_first_on_page(&pcur, &mtr)) { + /* The tree is empty */ + + max_space_id = 0; + } else { + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field(rec, 0, &len); + + ut_a(len == 4); + + max_space_id = mach_read_from_4(field); + } + + mtr_commit(&mtr); + ibuf_exit(); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +/************************************************************************* Makes an index insert to the insert buffer, instead of directly to the disk page, if this is possible. 
*/ static @@ -2041,8 +2395,6 @@ ibuf_insert_low( ulint entry_size; btr_pcur_t pcur; btr_cur_t* cursor; - mtr_t mtr; - mtr_t bitmap_mtr; dtuple_t* ibuf_entry; mem_heap_t* heap; ulint buffered; @@ -2054,16 +2406,25 @@ ibuf_insert_low( page_t* root; ulint err; ibool do_merge; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED]; ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; ulint n_stored; ulint bits; + mtr_t mtr; + mtr_t bitmap_mtr; ut_a(!(index->type & DICT_CLUSTERED)); ut_ad(dtuple_check_typed(entry)); + ut_a(trx_sys_multiple_tablespace_format); + do_merge = FALSE; - - ibuf_data = fil_space_get_ibuf_data(space); + + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ + + ibuf_data = fil_space_get_ibuf_data(0); ibuf_index = ibuf_data->index; @@ -2090,7 +2451,7 @@ ibuf_insert_low( mutex_enter(&ibuf_pessimistic_insert_mutex); ibuf_enter(); - + mutex_enter(&ibuf_mutex); while (!ibuf_data_enough_free_for_insert(ibuf_data)) { @@ -2101,7 +2462,7 @@ ibuf_insert_low( mutex_exit(&ibuf_pessimistic_insert_mutex); - err = ibuf_add_free_page(space, ibuf_data); + err = ibuf_add_free_page(0, ibuf_data); if (err == DB_STRONG_FAIL) { @@ -2126,7 +2487,7 @@ ibuf_insert_low( the first fields and the type information for other fields, and which will be inserted to the insert buffer. 
*/ - ibuf_entry = ibuf_entry_build(entry, page_no, heap); + ibuf_entry = ibuf_entry_build(entry, space, page_no, heap); /* Open a cursor to the insert buffer tree to calculate if we can add the new entry to it without exceeding the free space limit for the @@ -2151,7 +2512,6 @@ ibuf_insert_low( if (buf_page_peek(space, page_no) || lock_rec_expl_exist_on_page(space, page_no)) { - err = DB_STRONG_FAIL; mtr_commit(&bitmap_mtr); @@ -2164,7 +2524,6 @@ ibuf_insert_low( if (buffered + entry_size + page_dir_calc_reserved_space(1) > ibuf_index_page_calc_free_from_bits(bits)) { - mtr_commit(&bitmap_mtr); /* It may not fit */ @@ -2173,7 +2532,8 @@ ibuf_insert_low( do_merge = TRUE; ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), - page_nos, &n_stored); + space_ids, space_versions, page_nos, + &n_stored); goto function_exit; } @@ -2209,10 +2569,10 @@ ibuf_insert_low( which would cause the x-latching of the root after that to break the latching order. */ - root = ibuf_tree_root_get(ibuf_data, space, &mtr); + root = ibuf_tree_root_get(ibuf_data, 0, &mtr); err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG - | BTR_NO_UNDO_LOG_FLAG, + | BTR_NO_UNDO_LOG_FLAG, cursor, ibuf_entry, &ins_rec, &dummy_big_rec, thr, @@ -2229,6 +2589,10 @@ ibuf_insert_low( function_exit: #ifdef UNIV_IBUF_DEBUG if (err == DB_SUCCESS) { + printf( +"Incrementing ibuf count of space %lu page %lu\n" +"from %lu by 1\n", space, page_no, ibuf_count_get(space, page_no)); + ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) + 1); } @@ -2263,7 +2627,8 @@ function_exit: #ifdef UNIV_IBUF_DEBUG ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED); #endif - buf_read_ibuf_merge_pages(FALSE, space, page_nos, n_stored); + buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions, + page_nos, n_stored); } return(err); @@ -2286,6 +2651,7 @@ ibuf_insert( { ulint err; + ut_a(trx_sys_multiple_tablespace_format); ut_ad(dtuple_check_typed(entry)); ut_a(!(index->type & DICT_CLUSTERED)); @@ -2337,6 +2703,26 @@ 
ibuf_insert_to_index_page( ut_ad(ibuf_inside()); ut_ad(dtuple_check_typed(entry)); + if (rec_get_n_fields(page_rec_get_next(page_get_infimum_rec(page))) + != dtuple_get_n_fields(entry)) { + + fprintf(stderr, +"InnoDB: Trying to insert a record from the insert buffer to an index page\n" +"InnoDB: but the number of fields does not match!\n"); + + buf_page_print(page); + + dtuple_print(stderr, entry); + + fputs( +"InnoDB: The table where where this index record belongs\n" +"InnoDB: is now probably corrupt. Please run CHECK TABLE on\n" +"InnoDB: your tables.\n" +"InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n", stderr); + + return; + } + low_match = page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); if (low_match == dtuple_get_n_fields(entry)) { @@ -2360,8 +2746,8 @@ ibuf_insert_to_index_page( fprintf(stderr, "InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n", - page_get_max_insert_size(page, 1), - rec_get_converted_size(entry)); + (ulong) page_get_max_insert_size(page, 1), + (ulong) rec_get_converted_size(entry)); fputs("InnoDB: Cannot insert index record ", stderr); dtuple_print(stderr, entry); @@ -2374,22 +2760,20 @@ ibuf_insert_to_index_page( buf_frame_get_space_id(page), buf_frame_get_page_no(page), mtr); - old_bits = ibuf_bitmap_page_get_bits( bitmap_page, buf_frame_get_page_no(page), IBUF_BITMAP_FREE, mtr); - fprintf(stderr, "Bitmap bits %lu\n", old_bits); + fprintf(stderr, "Bitmap bits %lu\n", (ulong) old_bits); fputs( "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); - } } } } - + /************************************************************************* Deletes from ibuf the record on which pcur is positioned. 
If we have to resort to a pessimistic delete, this function commits mtr and closes @@ -2413,13 +2797,16 @@ ibuf_delete_rec( ibuf_data_t* ibuf_data; page_t* root; ulint err; - + ut_ad(ibuf_inside()); success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr); if (success) { #ifdef UNIV_IBUF_DEBUG + printf( +"Decrementing ibuf count of space %lu page %lu\n" +"from %lu by 1\n", space, page_no, ibuf_count_get(space, page_no)); ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1); #endif @@ -2431,7 +2818,10 @@ ibuf_delete_rec( btr_pcur_commit_specify_mtr(pcur, mtr); - ibuf_data = fil_space_get_ibuf_data(space); + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ + + ibuf_data = fil_space_get_ibuf_data(0); mutex_enter(&ibuf_mutex); @@ -2443,7 +2833,7 @@ ibuf_delete_rec( fprintf(stderr, "InnoDB: ERROR: Submit the output to http://bugs.mysql.com\n" "InnoDB: ibuf cursor restoration fails!\n" - "InnoDB: ibuf record inserted to page %lu\n", page_no); + "InnoDB: ibuf record inserted to page %lu\n", (ulong) page_no); fflush(stderr); rec_print(stderr, btr_pcur_get_rec(pcur)); @@ -2453,18 +2843,22 @@ ibuf_delete_rec( rec_print(stderr, page_rec_get_next(btr_pcur_get_rec(pcur))); fflush(stderr); - mtr_commit(mtr); + btr_pcur_commit_specify_mtr(pcur, mtr); fputs("InnoDB: Validating insert buffer tree:\n", stderr); ut_a(btr_validate_tree(ibuf_data->index->tree)); fprintf(stderr, "InnoDB: ibuf tree ok\n"); fflush(stderr); + + btr_pcur_close(pcur); + + mutex_exit(&ibuf_mutex); + + return(TRUE); } - - ut_a(success); - root = ibuf_tree_root_get(ibuf_data, space, mtr); + root = ibuf_tree_root_get(ibuf_data, 0, mtr); btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), FALSE, mtr); @@ -2500,7 +2894,11 @@ ibuf_merge_or_delete_for_page( page_t* page, /* in: if page has been read from disk, pointer to the page x-latched, else NULL */ ulint space, /* in: space id of the index page */ - ulint page_no)/* in: page number 
of the index page */ + ulint page_no,/* in: page number of the index page */ + ibool update_ibuf_bitmap)/* in: normally this is set to TRUE, but if + we have deleted or are deleting the tablespace, then we + naturally do not want to update a non-existent bitmap + page */ { mem_heap_t* heap; btr_pcur_t pcur; @@ -2514,6 +2912,7 @@ ibuf_merge_or_delete_for_page( #ifdef UNIV_IBUF_DEBUG ulint volume; #endif + ibool tablespace_being_deleted = FALSE; ibool corruption_noticed = FALSE; mtr_t mtr; @@ -2521,7 +2920,7 @@ ibuf_merge_or_delete_for_page( return; } - + #ifdef UNIV_LOG_DEBUG if (space % 2 != 0) { @@ -2535,28 +2934,57 @@ ibuf_merge_or_delete_for_page( return; } - mtr_start(&mtr); + if (update_ibuf_bitmap) { + /* If the following returns FALSE, we get the counter + incremented, and must decrement it when we leave this + function. When the counter is > 0, that prevents tablespace + from being dropped. */ - bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); + tablespace_being_deleted = fil_inc_pending_ibuf_merges(space); + + if (tablespace_being_deleted) { + /* Do not try to read the bitmap page from space; + just delete the ibuf records for the page */ + + page = NULL; + update_ibuf_bitmap = FALSE; + } + } + + if (update_ibuf_bitmap) { + mtr_start(&mtr); + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); - if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no, + if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no, IBUF_BITMAP_BUFFERED, &mtr)) { - /* No inserts buffered for this page */ + /* No inserts buffered for this page */ + mtr_commit(&mtr); - mtr_commit(&mtr); + if (!tablespace_being_deleted) { + fil_decr_pending_ibuf_merges(space); + } - return; + return; + } + mtr_commit(&mtr); } - mtr_commit(&mtr); + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ - ibuf_data = fil_space_get_ibuf_data(space); + ibuf_data = fil_space_get_ibuf_data(0); ibuf_enter(); heap = mem_heap_create(512); - search_tuple = 
ibuf_search_tuple_build(page_no, heap); + if (!trx_sys_multiple_tablespace_format) { + ut_a(trx_doublewrite_must_reset_space_ids); + search_tuple = ibuf_search_tuple_build(space, page_no, heap); + } else { + search_tuple = ibuf_new_search_tuple_build(space, page_no, + heap); + } if (page) { /* Move the ownership of the x-latch on the page to this OS @@ -2596,7 +3024,8 @@ ibuf_merge_or_delete_for_page( "InnoDB: merge for this page. Please run CHECK TABLE on your tables\n" "InnoDB: to determine if they are corrupt after this.\n\n" "InnoDB: Please submit a detailed bug report to http://bugs.mysql.com\n\n", - page_no, fil_page_get_type(page)); + (ulong) page_no, + (ulong) fil_page_get_type(page)); } } @@ -2610,7 +3039,7 @@ loop: if (page) { ibool success = buf_page_get_known_nowait(RW_X_LATCH, page, BUF_KEEP_OLD, - IB__FILE__, __LINE__, + __FILE__, __LINE__, &mtr); ut_a(success); #ifdef UNIV_SYNC_DEBUG @@ -2622,7 +3051,6 @@ loop: index page */ btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr); - if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); @@ -2635,23 +3063,14 @@ loop: ibuf_rec = btr_pcur_get_rec(&pcur); /* Check if the entry is for this index page */ - if (ibuf_rec_get_page_no(ibuf_rec) != page_no) { - + if (ibuf_rec_get_page_no(ibuf_rec) != page_no + || ibuf_rec_get_space(ibuf_rec) != space) { if (page) { page_header_reset_last_insert(page, &mtr); } - goto reset_bit; } - /* Do NOT merge to the 4.1 code base! 
*/ - if (trx_sys_downgrading_from_4_1_1) { - fputs( -"InnoDB: Fatal error: you are downgrading from >= 4.1.1 to 4.0, but\n" -"InnoDB: the insert buffer was not empty.\n", stderr); - ut_error; - } - if (corruption_noticed) { fputs("InnoDB: Discarding record\n ", stderr); rec_print(stderr, ibuf_rec); @@ -2665,14 +3084,12 @@ loop: dulint max_trx_id = page_get_max_trx_id( buf_frame_align(ibuf_rec)); - page_update_max_trx_id(page, max_trx_id); entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap); #ifdef UNIV_IBUF_DEBUG volume += rec_get_converted_size(entry) + page_dir_calc_reserved_space(1); - ut_a(volume <= 4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif @@ -2699,17 +3116,15 @@ loop: } reset_bit: - #ifdef UNIV_IBUF_DEBUG if (ibuf_count_get(space, page_no) > 0) { - /* btr_print_tree(ibuf_data->index->tree, 100); ibuf_print(); */ } #endif - bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); - - ibuf_bitmap_page_set_bits(bitmap_page, page_no, + if (update_ibuf_bitmap) { + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); + ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_BUFFERED, FALSE, &mtr); if (page) { ulint old_bits = ibuf_bitmap_page_get_bits(bitmap_page, @@ -2720,14 +3135,13 @@ reset_bit: old_bits, new_bits, page_get_max_insert_size_after_reorganize(page, 1)); */ #endif - if (old_bits != new_bits) { - - ibuf_bitmap_page_set_bits(bitmap_page, page_no, + if (old_bits != new_bits) { + ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_FREE, new_bits, &mtr); + } } } - #ifdef UNIV_IBUF_DEBUG /* fprintf(stderr, "Ibuf merge %lu records volume %lu to page no %lu\n", @@ -2735,7 +3149,6 @@ reset_bit: #endif mtr_commit(&mtr); btr_pcur_close(&pcur); - mem_heap_free(heap); /* Protect our statistics keeping from race conditions */ @@ -2746,12 +3159,124 @@ reset_bit: mutex_exit(&ibuf_mutex); + if (update_ibuf_bitmap && !tablespace_being_deleted) { + + fil_decr_pending_ibuf_merges(space); + } + ibuf_exit(); #ifdef 
UNIV_IBUF_DEBUG ut_a(ibuf_count_get(space, page_no) == 0); #endif } +/************************************************************************* +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ + +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space) /* in: space id */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; + rec_t* ibuf_rec; + ulint page_no; + ibool closed; + ibuf_data_t* ibuf_data; + ulint n_inserts; + mtr_t mtr; + + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ + + ibuf_data = fil_space_get_ibuf_data(0); + + heap = mem_heap_create(512); + + /* Use page number 0 to build the search tuple so that we get the + cursor positioned at the first entry for this space id */ + + search_tuple = ibuf_new_search_tuple_build(space, 0, heap); + + n_inserts = 0; +loop: + ibuf_enter(); + + mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for the + space */ + btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + + goto leave_loop; + } + + for (;;) { + ut_ad(btr_pcur_is_on_user_rec(&pcur, &mtr)); + + ibuf_rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this space */ + if (ibuf_rec_get_space(ibuf_rec) != space) { + + goto leave_loop; + } + + page_no = ibuf_rec_get_page_no(ibuf_rec); + + n_inserts++; + + /* Delete the record from ibuf */ + closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr); + if (closed) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ibuf_exit(); + + goto loop; + } + + if 
(btr_pcur_is_after_last_on_page(&pcur, &mtr)) { + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + ibuf_exit(); + + goto loop; + } + } + +leave_loop: + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + /* Protect our statistics keeping from race conditions */ + mutex_enter(&ibuf_mutex); + + ibuf_data->n_merges++; + ibuf_data->n_merged_recs += n_inserts; + + mutex_exit(&ibuf_mutex); + + printf("Discarded %lu ibuf entries for space %lu\n", (ulong) n_inserts, + (ulong) space); + + ibuf_exit(); + + mem_heap_free(heap); +} + + /********************************************************************** Validates the ibuf data structures when the caller owns ibuf_mutex. */ @@ -2783,6 +3308,56 @@ ibuf_validate_low(void) } /********************************************************************** +Looks if the insert buffer is empty. */ + +ibool +ibuf_is_empty(void) +/*===============*/ + /* out: TRUE if empty */ +{ + ibuf_data_t* data; + ibool is_empty; + page_t* root; + mtr_t mtr; + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + data = UT_LIST_GET_FIRST(ibuf->data_list); + + mtr_start(&mtr); + + root = ibuf_tree_root_get(data, 0, &mtr); + + if (page_get_n_recs(root) == 0) { + + is_empty = TRUE; + + if (data->empty == FALSE) { + fprintf(stderr, +"InnoDB: Warning: insert buffer tree is empty but the data struct does not\n" +"InnoDB: know it. This condition is legal if the master thread has not yet\n" +"InnoDB: run to completion.\n"); + } + } else { + ut_a(data->empty == FALSE); + + is_empty = FALSE; + } + + mtr_commit(&mtr); + + ut_a(data->space == 0); + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + return(is_empty); +} + +/********************************************************************** Prints info of ibuf. 
*/ void @@ -2800,18 +3375,35 @@ ibuf_print( data = UT_LIST_GET_FIRST(ibuf->data_list); while (data) { + fprintf(file, + "Ibuf for space %lu: size %lu, free list len %lu, seg size %lu,", + (ulong) data->space, (ulong) data->size, + (ulong) data->free_list_len, + (ulong) data->seg_size); + + if (data->empty) { + fputs(" is empty\n", file); + } else { + fputs(" is not empty\n", file); + } fprintf(file, "Ibuf for space %lu: size %lu, free list len %lu, seg size %lu,\n" "%lu inserts, %lu merged recs, %lu merges\n", - data->space, data->size, data->free_list_len, data->seg_size, - data->n_inserts, data->n_merged_recs, data->n_merges); + (ulong) data->space, + (ulong) data->size, + (ulong) data->free_list_len, + (ulong) data->seg_size, + (ulong) data->n_inserts, + (ulong) data->n_merged_recs, + (ulong) data->n_merges); #ifdef UNIV_IBUF_DEBUG for (i = 0; i < IBUF_COUNT_N_PAGES; i++) { if (ibuf_count_get(data->space, i) > 0) { fprintf(stderr, "Ibuf count for page %lu is %lu\n", - i, ibuf_count_get(data->space, i)); + (ulong) i, + (ulong) ibuf_count_get(data->space, i)); } } #endif diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic index fd66c7bf2a3..b0aa0756307 100644 --- a/innobase/include/btr0btr.ic +++ b/innobase/include/btr0btr.ic @@ -188,6 +188,7 @@ btr_node_ptr_get_child_page_no( ulint n_fields; byte* field; ulint len; + ulint page_no; n_fields = rec_get_n_fields(rec); @@ -196,7 +197,16 @@ btr_node_ptr_get_child_page_no( ut_ad(len == 4); - return(mach_read_from_4(field)); + page_no = mach_read_from_4(field); + + if (page_no == 0) { + fprintf(stderr, +"InnoDB: a nonsensical page number 0 in a node ptr record at offset %lu\n", + (unsigned long)(rec - buf_frame_align(rec))); + buf_page_print(buf_frame_align(rec)); + } + + return(page_no); } /****************************************************************** diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h index 9d07dd0de18..81f19af4d40 100644 --- a/innobase/include/btr0pcur.h +++ 
b/innobase/include/btr0pcur.h @@ -466,6 +466,9 @@ struct btr_pcur_struct{ BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the old_rec record */ + buf_block_t* block_when_stored;/* buffer block when the position was + stored; note that if AWE is on, frames + may move */ dulint modify_clock; /* the modify clock value of the buffer block when the cursor position was stored */ diff --git a/innobase/include/btr0pcur.ic b/innobase/include/btr0pcur.ic index a1db2cc52dd..b553a569bda 100644 --- a/innobase/include/btr0pcur.ic +++ b/innobase/include/btr0pcur.ic @@ -564,7 +564,7 @@ btr_pcur_open_at_index_side( } btr_cur_open_at_index_side(from_left, index, latch_mode, - btr_pcur_get_btr_cur(pcur), mtr); + btr_pcur_get_btr_cur(pcur), mtr); pcur->pos_state = BTR_PCUR_IS_POSITIONED; pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 5ac9c83a5f9..53599d03c73 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri #include "sync0rw.h" #include "hash0hash.h" #include "ut0byte.h" +#include "os0proc.h" /* Flags for flush types */ #define BUF_FLUSH_LRU 1 @@ -58,23 +59,34 @@ extern ibool buf_debug_prints;/* If this is set TRUE, the program occurs */ /************************************************************************ -Initializes the buffer pool of the database. */ +Creates the buffer pool. 
*/ -void +buf_pool_t* buf_pool_init( /*==========*/ - ulint max_size, /* in: maximum size of the pool in blocks */ - ulint curr_size); /* in: current size to use, must be <= + /* out, own: buf_pool object, NULL if not + enough memory or error */ + ulint max_size, /* in: maximum size of the buf_pool in + blocks */ + ulint curr_size, /* in: current size to use, must be <= + max_size, currently must be equal to max_size */ + ulint n_frames); /* in: number of frames; if AWE is used, + this is the size of the address space window + where physical memory pages are mapped; if + AWE is not used then this must be the same + as max_size */ /************************************************************************* -Gets the current size of buffer pool in bytes. */ +Gets the current size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_curr_size(void); /*========================*/ /* out: size in bytes */ /************************************************************************* -Gets the maximum size of buffer pool in bytes. */ +Gets the maximum size of buffer pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_max_size(void); @@ -118,7 +130,7 @@ to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed in LA! */ #define buf_page_get(SP, OF, LA, MTR) buf_page_get_gen(\ SP, OF, LA, NULL,\ - BUF_GET, IB__FILE__, __LINE__, MTR) + BUF_GET, __FILE__, __LINE__, MTR) /****************************************************************** Use these macros to bufferfix a page with no latching. Remember not to read the contents of the page unless you know it is safe. Do not modify @@ -127,19 +139,19 @@ error-prone programming not to set a latch, and it should be used with care. 
*/ #define buf_page_get_with_no_latch(SP, OF, MTR) buf_page_get_gen(\ SP, OF, RW_NO_LATCH, NULL,\ - BUF_GET_NO_LATCH, IB__FILE__, __LINE__, MTR) + BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR) /****************************************************************** NOTE! The following macros should be used instead of buf_page_get_gen, to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */ #define buf_page_get_nowait(SP, OF, LA, MTR) buf_page_get_gen(\ SP, OF, LA, NULL,\ - BUF_GET_NOWAIT, IB__FILE__, __LINE__, MTR) + BUF_GET_NOWAIT, __FILE__, __LINE__, MTR) /****************************************************************** NOTE! The following macros should be used instead of buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */ -#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\ - LA, G, MC, IB__FILE__, __LINE__, MTR) +#define buf_page_optimistic_get(LA, BL, G, MC, MTR) buf_page_optimistic_get_func(\ + LA, BL, G, MC, __FILE__, __LINE__, MTR) /************************************************************************ This is the general function used to get optimistic access to a database page. 
*/ @@ -149,10 +161,12 @@ buf_page_optimistic_get_func( /*=========================*/ /* out: TRUE if success */ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ - buf_frame_t* guess, /* in: guessed frame */ + buf_block_t* block, /* in: guessed block */ + buf_frame_t* guess, /* in: guessed frame; note that AWE may move + frames */ dulint modify_clock,/* in: modify clock value if mode is ..._GUESS_ON_CLOCK */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr); /* in: mini-transaction */ /************************************************************************ @@ -185,7 +199,7 @@ buf_page_get_known_nowait( ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ buf_frame_t* guess, /* in: the known page frame */ ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr); /* in: mini-transaction */ /************************************************************************ @@ -201,7 +215,7 @@ buf_page_get_gen( buf_frame_t* guess, /* in: guessed frame or NULL */ ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL, BUF_GET_NO_LATCH */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr); /* in: mini-transaction */ /************************************************************************ @@ -350,6 +364,16 @@ buf_frame_modify_clock_inc( /* out: new value */ buf_frame_t* frame); /* in: pointer to a frame */ /************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. 
*/ +UNIV_INLINE +dulint +buf_block_modify_clock_inc( +/*=======================*/ + /* out: new value */ + buf_block_t* block); /* in: block */ +/************************************************************************ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. */ UNIV_INLINE @@ -441,7 +465,7 @@ UNIV_INLINE buf_frame_t* buf_frame_align( /*============*/ - /* out: pointer to block */ + /* out: pointer to frame */ byte* ptr); /* in: pointer to a frame */ /*********************************************************************** Checks if a pointer points to the block array of the buffer pool (blocks, not @@ -524,6 +548,19 @@ buf_pool_invalidate(void); --------------------------- LOWER LEVEL ROUTINES ------------------------- =========================================================================*/ +/************************************************************************ +Maps the page of block to a frame, if not mapped yet. Unmaps some page +from the end of the awe_LRU_free_mapped. */ + +void +buf_awe_map_page_to_frame( +/*======================*/ + buf_block_t* block, /* in: block whose page should be + mapped to a frame */ + ibool add_to_mapped_list);/* in: TRUE if, in the case + we need to map the page, we should also + add the block to the + awe_LRU_free_mapped list */ #ifdef UNIV_SYNC_DEBUG /************************************************************************* Adds latch level info for the rw-lock protecting the buffer frame. This @@ -589,19 +626,27 @@ buf_pool_get_nth_block( ulint i); /* in: index of the block */ /************************************************************************ Function which inits a page for read to the buffer buf_pool. If the page is -already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and -sets a non-recursive exclusive lock on the buffer frame. The io-handler must -take care that the flag is cleared and the lock released later. 
This is one -of the functions which perform the state transition NOT_USED => FILE_PAGE to -a block (the other is buf_page_create). */ +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. This is one of the functions which perform the +state transition NOT_USED => FILE_PAGE to a block (the other is +buf_page_create). */ buf_block_t* buf_page_init_for_read( /*===================*/ - /* out: pointer to the block */ - ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ - ulint space, /* in: space id */ - ulint offset);/* in: page number */ + /* out: pointer to the block or NULL */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ + ulint space, /* in: space id */ + ib_longlong tablespace_version,/* in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset);/* in: page number */ /************************************************************************ Completes an asynchronous read or write request of a file page to or from the buffer pool. 
*/ @@ -658,7 +703,16 @@ struct buf_block_struct{ byte* frame; /* pointer to buffer frame which is of size UNIV_PAGE_SIZE, and aligned to an address divisible by - UNIV_PAGE_SIZE */ + UNIV_PAGE_SIZE; if AWE is used, this + will be NULL for the pages which are + currently not mapped into the virtual + address space window of the buffer + pool */ + os_awe_t* awe_info; /* if AWE is used, then an array of + awe page infos for + UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE + (normally = 4) physical memory + pages; otherwise NULL */ ulint space; /* space id of the page */ ulint offset; /* page number within the space */ ulint lock_hash_val; /* hashed value of the page address @@ -667,14 +721,6 @@ struct buf_block_struct{ record lock hash table */ rw_lock_t lock; /* read-write lock of the buffer frame */ - rw_lock_t read_lock; /* rw-lock reserved when a page read - to the frame is requested; a thread - can wait for this rw-lock if it wants - to wait for the read to complete; - the usual way is to wait for lock, - but if the thread just wants a - bufferfix and no latch on the page, - then it can wait for this rw-lock */ buf_block_t* hash; /* node used in chaining to the page hash table */ ibool check_index_page_at_flush; @@ -709,8 +755,16 @@ struct buf_block_struct{ UT_LIST_NODE_T(buf_block_t) free; /* node of the free block list */ + ibool in_free_list; /* TRUE if in the free list; used in + debugging */ UT_LIST_NODE_T(buf_block_t) LRU; /* node of the LRU list */ + UT_LIST_NODE_T(buf_block_t) awe_LRU_free_mapped; + /* in the AWE version, node in the + list of free and LRU blocks which are + mapped to a frame */ + ibool in_LRU_list; /* TRUE if the page is in the LRU list; + used in debugging */ ulint LRU_position; /* value which monotonically decreases (or may stay constant if the block is in the old blocks) toward @@ -771,6 +825,9 @@ struct buf_block_struct{ complete, though: there may have been hash collisions, record deletions, etc. 
*/ + ulint n_pointers; /* used in debugging: the number of + pointers in the adaptive hash index + pointing to this frame */ ulint curr_n_fields; /* prefix length for hash indexing: number of full fields */ ulint curr_n_bytes; /* number of bytes in hash indexing */ @@ -802,16 +859,36 @@ struct buf_pool_struct{ struct and control blocks, except the read-write lock in them */ byte* frame_mem; /* pointer to the memory area which - was allocated for the frames */ + was allocated for the frames; in AWE + this is the virtual address space + window where we map pages stored + in physical memory */ byte* frame_zero; /* pointer to the first buffer frame: this may differ from frame_mem, because this is aligned by the frame size */ - byte* high_end; /* pointer to the end of the - buffer pool */ + byte* high_end; /* pointer to the end of the buffer + frames */ + ulint n_frames; /* number of frames */ buf_block_t* blocks; /* array of buffer control blocks */ + buf_block_t** blocks_of_frames;/* inverse mapping which can be used + to retrieve the buffer control block + of a frame; this is an array which + lists the blocks of frames in the + order frame_zero, + frame_zero + UNIV_PAGE_SIZE, ... 
+ a control block is always assigned + for each frame, even if the frame does + not contain any data; note that in AWE + there are more control blocks than + buffer frames */ + os_awe_t* awe_info; /* if AWE is used, AWE info for the + physical 4 kB memory pages associated + with buffer frames */ ulint max_size; /* number of control blocks == maximum pool size in pages */ - ulint curr_size; /* current pool size in pages */ + ulint curr_size; /* current pool size in pages; + currently always the same as + max_size */ hash_table_t* page_hash; /* hash table of the file pages */ ulint n_pend_reads; /* number of pending read operations */ @@ -828,6 +905,9 @@ struct buf_pool_struct{ counted as page gets; this field is NOT protected by the buffer pool mutex */ + ulint n_pages_awe_remapped; /* if AWE is enabled, the + number of remaps of blocks to + buffer frames */ ulint n_page_gets_old;/* n_page_gets when buf_print was last time called: used to calculate hit rate */ @@ -836,6 +916,7 @@ struct buf_pool_struct{ ulint n_pages_written_old;/* number write operations */ ulint n_pages_created_old;/* number of pages created in the pool with no read */ + ulint n_pages_awe_remapped_old; /* 2. Page flushing algorithm fields */ UT_LIST_BASE_NODE_T(buf_block_t) flush_list; @@ -868,7 +949,10 @@ struct buf_pool_struct{ /* 3. 
LRU replacement algorithm fields */ UT_LIST_BASE_NODE_T(buf_block_t) free; - /* base node of the free block list */ + /* base node of the free block list; + in the case of AWE, at the start are + always free blocks for which the + physical memory is mapped to a frame */ UT_LIST_BASE_NODE_T(buf_block_t) LRU; /* base node of the LRU list */ buf_block_t* LRU_old; /* pointer to the about 3/8 oldest @@ -880,6 +964,12 @@ struct buf_pool_struct{ see buf0lru.c for the restrictions on this value; not defined if LRU_old == NULL */ + UT_LIST_BASE_NODE_T(buf_block_t) awe_LRU_free_mapped; + /* list of those blocks which are + in the LRU list or the free list, and + where the page is mapped to a frame; + thus, frames allocated, e.g., to the + lock table, are not in this list */ }; /* States of a control block */ diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic index 16deade0901..681a0ef000a 100644 --- a/innobase/include/buf0buf.ic +++ b/innobase/include/buf0buf.ic @@ -28,7 +28,6 @@ buf_block_peek_if_too_old( { if (buf_pool->freed_page_clock >= block->freed_page_clock + 1 + (buf_pool->curr_size / 1024)) { - return(TRUE); } @@ -36,25 +35,27 @@ buf_block_peek_if_too_old( } /************************************************************************* -Gets the current size of buffer buf_pool in bytes. */ +Gets the current size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_curr_size(void) /*========================*/ /* out: size in bytes */ { - return((buf_pool->curr_size) * UNIV_PAGE_SIZE); + return((buf_pool->n_frames) * UNIV_PAGE_SIZE); } /************************************************************************* -Gets the maximum size of buffer buf_pool in bytes. */ +Gets the maximum size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). 
*/ UNIV_INLINE ulint buf_pool_get_max_size(void) /*=======================*/ /* out: size in bytes */ { - return((buf_pool->max_size) * UNIV_PAGE_SIZE); + return((buf_pool->n_frames) * UNIV_PAGE_SIZE); } /*********************************************************************** @@ -169,7 +170,7 @@ buf_block_get_space( ut_ad(block); ut_ad(block >= buf_pool->blocks); ut_ad(block < buf_pool->blocks + buf_pool->max_size); - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_ad(block->buf_fix_count > 0); return(block->space); @@ -187,7 +188,7 @@ buf_block_get_page_no( ut_ad(block); ut_ad(block >= buf_pool->blocks); ut_ad(block < buf_pool->blocks + buf_pool->max_size); - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_ad(block->buf_fix_count > 0); return(block->offset); @@ -209,54 +210,24 @@ buf_block_align( frame_zero = buf_pool->frame_zero; - ut_ad((ulint)ptr >= (ulint)frame_zero); - - block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero)) - >> UNIV_PAGE_SIZE_SHIFT); - if (block < buf_pool->blocks - || block >= buf_pool->blocks + buf_pool->max_size) { + if ((ulint)ptr < (ulint)frame_zero + || (ulint)ptr > (ulint)(buf_pool->high_end)) { + ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Error: trying to access a stray pointer %p\n" -"InnoDB: buf pool start is at %p, number of pages %lu\n", ptr, - frame_zero, buf_pool->max_size); +"InnoDB: buf pool start is at %p, end at %p\n" +"InnoDB: Probable reason is database corruption or memory\n" +"InnoDB: corruption. If this happens in an InnoDB database recovery,\n" +"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n" +"InnoDB: how to force recovery.\n", + ptr, frame_zero, + buf_pool->high_end); ut_error; } - - return(block); -} - -/*********************************************************************** -Gets the block to whose frame the pointer is pointing to. 
Does not -require a file page to be bufferfixed. */ -UNIV_INLINE -buf_block_t* -buf_block_align_low( -/*================*/ - /* out: pointer to block */ - byte* ptr) /* in: pointer to a frame */ -{ - buf_block_t* block; - buf_frame_t* frame_zero; - - ut_ad(ptr); - - frame_zero = buf_pool->frame_zero; - - ut_ad((ulint)ptr >= (ulint)frame_zero); - - block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero)) - >> UNIV_PAGE_SIZE_SHIFT); - if (block < buf_pool->blocks - || block >= buf_pool->blocks + buf_pool->max_size) { - - fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %p\n" -"InnoDB: buf pool start is at %p, number of pages %lu\n", ptr, - frame_zero, buf_pool->max_size); - ut_error; - } - + + block = *(buf_pool->blocks_of_frames + (((ulint)(ptr - frame_zero)) + >> UNIV_PAGE_SIZE_SHIFT)); return(block); } @@ -266,7 +237,7 @@ UNIV_INLINE buf_frame_t* buf_frame_align( /*============*/ - /* out: pointer to block */ + /* out: pointer to frame */ byte* ptr) /* in: pointer to a frame */ { buf_frame_t* frame; @@ -275,14 +246,19 @@ buf_frame_align( frame = ut_align_down(ptr, UNIV_PAGE_SIZE); - if (((ulint)frame - < (ulint)(buf_pool->frame_zero)) - || ((ulint)frame > (ulint)(buf_pool_get_nth_block(buf_pool, - buf_pool->max_size - 1)->frame))) { + if (((ulint)frame < (ulint)(buf_pool->frame_zero)) + || (ulint)frame >= (ulint)(buf_pool->high_end)) { + + ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Error: trying to access a stray pointer %p\n" -"InnoDB: buf pool start is at %p, number of pages %lu\n", ptr, - buf_pool->frame_zero, buf_pool->max_size); +"InnoDB: buf pool start is at %p, end at %p\n" +"InnoDB: Probable reason is database corruption or memory\n" +"InnoDB: corruption. 
If this happens in an InnoDB database recovery,\n" +"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n" +"InnoDB: how to force recovery.\n", + ptr, buf_pool->frame_zero, + buf_pool->high_end); ut_error; } @@ -471,11 +447,32 @@ buf_frame_modify_clock_inc( ut_ad(frame); - block = buf_block_align_low(frame); + block = buf_block_align(frame); #ifdef UNIV_SYNC_DEBUG ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /*UNIV_SYNC_DEBUG */ + + UT_DULINT_INC(block->modify_clock); + + return(block->modify_clock); +} + +/************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +dulint +buf_block_modify_clock_inc( +/*=======================*/ + /* out: new value */ + buf_block_t* block) /* in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ UT_DULINT_INC(block->modify_clock); @@ -515,15 +512,16 @@ void buf_block_buf_fix_inc_debug( /*========================*/ buf_block_t* block, /* in: block to bufferfix */ - char* file, /* in: file name */ - ulint line) /* in: line */ + const char* file __attribute__ ((unused)), /* in: file name */ + ulint line __attribute__ ((unused))) /* in: line */ { +#ifdef UNIV_SYNC_DEBUG ibool ret; - + ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line); ut_ad(ret == TRUE); - +#endif block->buf_fix_count++; } #else /* UNIV_SYNC_DEBUG */ @@ -562,6 +560,8 @@ buf_page_hash_get( HASH_SEARCH(hash, buf_pool->page_hash, fold, block, (block->space == space) && (block->offset == offset)); + ut_a(block == NULL || block->state == BUF_BLOCK_FILE_PAGE); + return(block); } @@ -589,7 +589,7 @@ 
buf_page_get_release_on_io( frame = buf_page_get_gen(space, offset, rw_latch, guess, BUF_GET_IF_IN_POOL, - IB__FILE__, __LINE__, + __FILE__, __LINE__, mtr); if (frame != NULL) { @@ -629,8 +629,8 @@ buf_page_release( mutex_enter_fast(&(buf_pool->mutex)); - ut_ad(block->state == BUF_BLOCK_FILE_PAGE); - ut_ad(block->buf_fix_count > 0); + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(block->buf_fix_count > 0); if (rw_latch == RW_X_LATCH && mtr->modifications) { diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h index eb9d43d3b93..69a376f8cab 100644 --- a/innobase/include/buf0lru.h +++ b/innobase/include/buf0lru.h @@ -37,6 +37,16 @@ These are low-level functions #define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA) /********************************************************************** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. A PROBLEM: if readahead is being started, +what guarantees that it will not try to read in pages after this operation has +completed? */ + +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id); /* in: space id */ +/********************************************************************** Gets the minimum LRU_position field for the blocks in an initial segment (determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not guaranteed to be precise, because the ulint_clock may wrap around. */ @@ -67,7 +77,9 @@ LRU list to the free list. */ buf_block_t* buf_LRU_get_free_block(void); /*=========================*/ - /* out: the free control block */ + /* out: the free control block; also if AWE is + used, it is guaranteed that the block has its + page mapped to a frame when we return */ /********************************************************************** Puts a block back to the free list. 
*/ diff --git a/innobase/include/buf0rea.h b/innobase/include/buf0rea.h index aed965a6b21..380a42f4b80 100644 --- a/innobase/include/buf0rea.h +++ b/innobase/include/buf0rea.h @@ -59,7 +59,7 @@ buf_read_ahead_linear( must want access to this page (see NOTE 3 above) */ /************************************************************************ Issues read requests for pages which the ibuf module wants to read in, in -order to contract insert buffer trees. Technically, this function is like +order to contract the insert buffer tree. Technically, this function is like a read-ahead function. */ void @@ -68,9 +68,14 @@ buf_read_ibuf_merge_pages( ibool sync, /* in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ - ulint space, /* in: space id */ - ulint* page_nos, /* in: array of page numbers to read, with - the highest page number last in the array */ + ulint* space_ids, /* in: array of space ids */ + ib_longlong* space_versions,/* in: the spaces must have this version + number (timestamp), otherwise we discard the + read; we use this to cancel reads if + DISCARD + IMPORT may have changed the + tablespace size */ + ulint* page_nos, /* in: array of page numbers to read, with the + highest page number the last in the array */ ulint n_stored); /* in: number of page numbers in the array */ /************************************************************************ Issues read requests for pages which recovery wants to read in. 
*/ diff --git a/innobase/include/data0data.h b/innobase/include/data0data.h index 80207631dd9..2136de0f9b3 100644 --- a/innobase/include/data0data.h +++ b/innobase/include/data0data.h @@ -86,7 +86,7 @@ void dfield_set_data( /*============*/ dfield_t* field, /* in: field */ - void* data, /* in: data */ + const void* data, /* in: data */ ulint len); /* in: length or UNIV_SQL_NULL */ /************************************************************************** Writes an SQL null field full of zeros. */ diff --git a/innobase/include/data0data.ic b/innobase/include/data0data.ic index 697a272ccd6..0769372e16f 100644 --- a/innobase/include/data0data.ic +++ b/innobase/include/data0data.ic @@ -93,12 +93,12 @@ void dfield_set_data( /*============*/ dfield_t* field, /* in: field */ - void* data, /* in: data */ + const void* data, /* in: data */ ulint len) /* in: length or UNIV_SQL_NULL */ { ut_ad(field); - field->data = data; + field->data = (void*) data; field->len = len; } diff --git a/innobase/include/data0type.h b/innobase/include/data0type.h index 4da686bf2e1..fe38a224a66 100644 --- a/innobase/include/data0type.h +++ b/innobase/include/data0type.h @@ -11,6 +11,9 @@ Created 1/16/1996 Heikki Tuuri #include "univ.i" +extern ulint data_mysql_default_charset_coll; +extern ulint data_mysql_latin1_swedish_charset_coll; + /* SQL data type struct */ typedef struct dtype_struct dtype_t; @@ -18,31 +21,79 @@ typedef struct dtype_struct dtype_t; data type */ extern dtype_t* dtype_binary; -/* Data main types of SQL data */ -#define DATA_VARCHAR 1 /* character varying */ -#define DATA_CHAR 2 /* fixed length character */ +/*-------------------------------------------*/ +/* The 'MAIN TYPE' of a column */ +#define DATA_VARCHAR 1 /* character varying of the + latin1_swedish_ci charset-collation */ +#define DATA_CHAR 2 /* fixed length character of the + latin1_swedish_ci charset-collation */ #define DATA_FIXBINARY 3 /* binary string of fixed length */ #define DATA_BINARY 4 /* binary string */ 
-#define DATA_BLOB 5 /* binary large object, or a TEXT type; if - prtype & DATA_NONLATIN1 != 0 the data must - be compared by MySQL as a whole field; if - prtype & DATA_BINARY_TYPE == 0, then this is - actually a TEXT column */ +#define DATA_BLOB 5 /* binary large object, or a TEXT type; + if prtype & DATA_BINARY_TYPE == 0, then this is + actually a TEXT column (or a BLOB created + with < 4.0.14) */ #define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ #define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ #define DATA_SYS 8 /* system column */ + /* Data types >= DATA_FLOAT must be compared using the whole field, not as binary strings */ + #define DATA_FLOAT 9 #define DATA_DOUBLE 10 #define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */ -#define DATA_VARMYSQL 12 /* non-latin1 varying length char */ -#define DATA_MYSQL 13 /* non-latin1 fixed length char */ +#define DATA_VARMYSQL 12 /* any charset varying length char */ +#define DATA_MYSQL 13 /* any charset fixed length char */ + /* NOTE that 4.1.1 used DATA_MYSQL and + DATA_VARMYSQL for all character sets, and the + charset-collation for tables created with it + can also be latin1_swedish_ci */ #define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size() requires the values are <= 63 */ /*-------------------------------------------*/ -/* In the lowest byte in the precise type we store the MySQL type code -(not applicable for system columns). */ +/* The 'PRECISE TYPE' of a column */ +/* +Tables created by a MySQL user have the following convention: + +- In the least significant byte in the precise type we store the MySQL type +code (not applicable for system columns). + +- In the second least significant byte we OR flags DATA_NOT_NULL, +DATA_UNSIGNED, DATA_BINARY_TYPE. + +- In the third least significant byte of the precise type of string types we +store the MySQL charset-collation code. 
In DATA_BLOB columns created with +< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there +are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no +problem, though. + +Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the +precise type, since the charset was always the default charset of the MySQL +installation. If the stored charset code is 0 in the system table SYS_COLUMNS +of InnoDB, that means that the default charset of this MySQL installation +should be used. + +When loading a table definition from the system tables to the InnoDB data +dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check +if the stored charset-collation is 0, and if that is the case and the type is +a non-binary string, replace that 0 by the default charset-collation code of +this MySQL installation. In short, in old tables, the charset-collation code +in the system tables on disk can be 0, but in in-memory data structures +(dtype_t), the charset-collation code is always != 0 for non-binary string +types. + +In new tables, in binary string types, the charset-collation code is the +MySQL code for the 'binary charset', that is, != 0. + +For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those +DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci, +InnoDB performs all comparisons internally, without resorting to the MySQL +comparison functions. This is to save CPU time. + +InnoDB's own internal system tables have different precise types for their +columns, and for them the precise type is usually not used at all. 
+*/ #define DATA_ENGLISH 4 /* English language character string: this is a relic from pre-MySQL time and only used @@ -69,7 +120,7 @@ be less than 256 */ #define DATA_MIX_ID_LEN 9 /* maximum stored length for mix id (in a compressed dulint form) */ #define DATA_N_SYS_COLS 4 /* number of system columns defined above */ -/*-------------------------------------------*/ + /* Flags ORed to the precise data type */ #define DATA_NOT_NULL 256 /* this is ORed to the precise type when the column is declared as NOT NULL */ @@ -79,17 +130,52 @@ be less than 256 */ string, this is ORed to the precise type: this only holds for tables created with >= MySQL-4.0.14 */ -#define DATA_NONLATIN1 2048 /* if the data type is a DATA_BLOB (actually - TEXT) of a non-latin1 type, this is ORed to - the precise type: this only holds for tables - created with >= MySQL-4.0.14 */ +/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1. + In earlier versions this was set for some + BLOB columns. +*/ /*-------------------------------------------*/ /* This many bytes we need to store the type information affecting the alphabetical order for a single field and decide the storage size of an SQL null*/ -#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we add 2 bytes more so that we can also +store the charset-collation number; one byte is left unused, though */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 + +/************************************************************************* +Checks if a data main type is a string type. Also a BLOB is considered a +string type. */ + +ibool +dtype_is_string_type( +/*=================*/ + /* out: TRUE if string type */ + ulint mtype); /* in: InnoDB main data type code: DATA_CHAR, ... */ +/************************************************************************* +Checks if a type is a binary string type. 
Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. */ + +ibool +dtype_is_binary_string_type( +/*========================*/ + /* out: TRUE if binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype);/* in: precise type */ +/************************************************************************* +Checks if a type is a non-binary string type. That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. */ +ibool +dtype_is_non_binary_string_type( +/*============================*/ + /* out: TRUE if non-binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype);/* in: precise type */ /************************************************************************* Sets a data type structure. */ UNIV_INLINE @@ -124,6 +210,23 @@ dtype_get_prtype( /*=============*/ dtype_t* type); /************************************************************************* +Gets the MySQL charset-collation code for MySQL string types. */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype);/* in: precise data type */ +/************************************************************************* +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. */ + +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /* in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll); /* in: MySQL charset-collation code */ +/************************************************************************* Gets the type length. 
*/ UNIV_INLINE ulint @@ -172,24 +275,37 @@ dtype_is_fixed_size( /* out: TRUE if fixed size */ dtype_t* type); /* in: type */ /************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /* in: type struct */ + byte* buf); /* in: buffer for the stored order info */ +/************************************************************************** Stores for a type the information which determines its alphabetical ordering -and the storage size of an SQL NULL value. */ +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ UNIV_INLINE void -dtype_store_for_order_and_null_size( -/*================================*/ - byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /* in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes where we store the info */ dtype_t* type); /* in: type struct */ /************************************************************************** Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. */ +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ UNIV_INLINE void -dtype_read_for_order_and_null_size( -/*===============================*/ +dtype_new_read_for_order_and_null_size( +/*===================================*/ dtype_t* type, /* in: type struct */ - byte* buf); /* in: buffer for the stored order info */ + byte* buf); /* in: buffer for stored type order info */ + /************************************************************************* Validates a data type structure. 
*/ @@ -212,7 +328,7 @@ struct dtype_struct{ ulint mtype; /* main data type */ ulint prtype; /* precise type; MySQL data type */ - /* remaining two fields do not affect alphabetical ordering: */ + /* the remaining two fields do not affect alphabetical ordering: */ ulint len; /* length */ ulint prec; /* precision */ diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic index dbc5b6615f6..946b646ffbf 100644 --- a/innobase/include/data0type.ic +++ b/innobase/include/data0type.ic @@ -72,6 +72,17 @@ dtype_get_prtype( } /************************************************************************* +Gets the MySQL charset-collation code for MySQL string types. */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype) /* in: precise data type */ +{ + return((prtype >> 16) & 0xFFUL); +} + +/************************************************************************* Gets the type length. */ UNIV_INLINE ulint @@ -127,35 +138,44 @@ dtype_get_pad_char( /************************************************************************** Stores for a type the information which determines its alphabetical ordering -and the storage size of an SQL NULL value. */ +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. 
*/ UNIV_INLINE void -dtype_store_for_order_and_null_size( -/*================================*/ - byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /* in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes where we store the info */ dtype_t* type) /* in: type struct */ { - ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); + ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - buf[0] = (byte)(type->mtype & 0xFF); + buf[0] = (byte)(type->mtype & 0xFFUL); if (type->prtype & DATA_BINARY_TYPE) { buf[0] = buf[0] | 128; } - if (type->prtype & DATA_NONLATIN1) { - buf[0] = buf[0] | 64; - } + /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) { + buf[0] = buf[0] | 64; + } + */ + + buf[1] = (byte)(type->prtype & 0xFFUL); - buf[1] = (byte)(type->prtype & 0xFF); + mach_write_to_2(buf + 2, type->len & 0xFFFFUL); - mach_write_to_2(buf + 2, type->len & 0xFFFF); + mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); + + /* Note that the second last byte is left unused, because the + charset-collation code is always < 256 */ } /************************************************************************** Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. */ +ordering and the storage size of an SQL NULL value. This is the < 4.1.x +storage format. 
*/ UNIV_INLINE void dtype_read_for_order_and_null_size( @@ -172,12 +192,56 @@ dtype_read_for_order_and_null_size( type->prtype = type->prtype | DATA_BINARY_TYPE; } - if (buf[0] & 64) { - type->prtype = type->prtype | DATA_NONLATIN1; + type->len = mach_read_from_2(buf + 2); + + type->prtype = dtype_form_prtype(type->prtype, + data_mysql_default_charset_coll); +} + +/************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the >= 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /* in: type struct */ + byte* buf) /* in: buffer for stored type order info */ +{ + ulint charset_coll; + + ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype = type->prtype | DATA_BINARY_TYPE; } type->len = mach_read_from_2(buf + 2); -} + + mach_read_from_2(buf + 4); + + charset_coll = mach_read_from_2(buf + 4); + + if (dtype_is_string_type(type->mtype)) { + ut_a(charset_coll < 256); + + if (charset_coll == 0) { + /* This insert buffer record was inserted with MySQL + version < 4.1.2, and the charset-collation code was not + explicitly stored to dtype->prtype at that time. It + must be the default charset-collation of this MySQL + installation. */ + + charset_coll = data_mysql_default_charset_coll; + } + + type->prtype = dtype_form_prtype(type->prtype, charset_coll); + } +} /*************************************************************************** Returns the size of a fixed size data type, 0 if not a fixed size type. 
*/ diff --git a/innobase/include/db0err.h b/innobase/include/db0err.h index 854b9794c00..be7667bfd0c 100644 --- a/innobase/include/db0err.h +++ b/innobase/include/db0err.h @@ -48,6 +48,11 @@ Created 5/24/1996 Heikki Tuuri from a table failed */ #define DB_NO_SAVEPOINT 42 /* no savepoint exists with the given name */ +#define DB_TABLESPACE_ALREADY_EXISTS 43 /* we cannot create a new single-table + tablespace because a file of the same + name already exists */ +#define DB_TABLESPACE_DELETED 44 /* tablespace does not exist or is + being dropped right now */ /* The following are partial failure codes */ #define DB_FAIL 1000 diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h index cb631be7e35..35eff5af29a 100644 --- a/innobase/include/dict0boot.h +++ b/innobase/include/dict0boot.h @@ -93,7 +93,7 @@ dict_create(void); indexes; ibuf tables and indexes are assigned as the id the number DICT_IBUF_ID_MIN plus the space id */ -#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFF, 0) +#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFFUL, 0) /* The offset of the dictionary header on the page */ #define DICT_HDR FSEG_PAGE_DATA diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index 835c2c2b2e6..ebb34f7dda0 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -60,6 +60,16 @@ Inits the data dictionary module. */ void dict_init(void); /*===========*/ +/************************************************************************ +Gets the space id of every table of the data dictionary and makes a linear +list and a hash table of them to the data dictionary cache. This function +can be called at database startup if we did not need to do a crash recovery. +In crash recovery we must scan the space id's from the .ibd files in MySQL +database directories. 
*/ + +void +dict_load_space_id_list(void); +/*=========================*/ /************************************************************************* Gets the column data type. */ UNIV_INLINE @@ -156,11 +166,20 @@ dict_table_rename_in_cache( /*=======================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table */ - char* new_name, /* in: new name */ + const char* new_name, /* in: new name */ ibool rename_also_foreigns);/* in: in ALTER TABLE we want to preserve the original table name in constraints which reference it */ /************************************************************************** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ + +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /* in: table object already in cache */ + dulint new_id);/* in: new id to set */ +/************************************************************************** Adds a foreign key constraint object to the dictionary cache. May free the object if there already is an object with the same identifier in. At least one of foreign table or referenced table must already be in @@ -191,16 +210,19 @@ fields than mentioned in the constraint. 
*/ ulint dict_create_foreign_constraints( /*============================*/ - /* out: error code or DB_SUCCESS */ - trx_t* trx, /* in: transaction */ - char* sql_string, /* in: table create statement where - foreign keys are declared like: - FOREIGN KEY (a, b) REFERENCES table2(c, d), - table2 can be written also with the database - name before it: test.table2; the default - database id the database of parameter name */ - char* name); /* in: table full name in the normalized form - database_name/table_name */ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES + table2(c, d), table2 can be written + also with the database + name before it: test.table2; the + default database is the database of + parameter name */ + const char* name); /* in: table full name in the + normalized form + database_name/table_name */ /************************************************************************** Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. */ @@ -227,9 +249,10 @@ directory dict_table_get_low is usually the appropriate function. */ dict_table_t* dict_table_get( /*===========*/ - /* out: table, NULL if does not exist */ - char* table_name, /* in: table name */ - trx_t* trx); /* in: transaction handle */ + /* out: table, NULL if + does not exist */ + const char* table_name, /* in: table name */ + trx_t* trx); /* in: transaction handle */ /************************************************************************** Returns a table object and increments MySQL open handle count on the table. */ @@ -237,9 +260,10 @@ Returns a table object and increments MySQL open handle count on the table. 
dict_table_t* dict_table_get_and_increment_handle_count( /*======================================*/ - /* out: table, NULL if does not exist */ - char* table_name, /* in: table name */ - trx_t* trx); /* in: transaction handle or NULL */ + /* out: table, NULL if + does not exist */ + const char* table_name, /* in: table name */ + trx_t* trx); /* in: transaction handle or NULL */ /************************************************************************** Returns a table object, based on table id, and memoryfixes it. */ @@ -271,8 +295,8 @@ UNIV_INLINE dict_table_t* dict_table_check_if_in_cache_low( /*==============================*/ - /* out: table, NULL if not found */ - char* table_name); /* in: table name */ + /* out: table, NULL if not found */ + const char* table_name); /* in: table name */ /************************************************************************** Gets a table; loads it to the dictionary cache if necessary. A low-level function. */ @@ -280,8 +304,8 @@ UNIV_INLINE dict_table_t* dict_table_get_low( /*===============*/ - /* out: table, NULL if not found */ - char* table_name); /* in: table name */ + /* out: table, NULL if not found */ + const char* table_name); /* in: table name */ /************************************************************************** Returns an index object. */ UNIV_INLINE @@ -290,7 +314,7 @@ dict_table_get_index( /*=================*/ /* out: index, NULL if does not exist */ dict_table_t* table, /* in: table */ - char* name); /* in: index name */ + const char* name); /* in: index name */ /************************************************************************** Returns an index object. */ @@ -299,7 +323,7 @@ dict_table_get_index_noninline( /*===========================*/ /* out: index, NULL if does not exist */ dict_table_t* table, /* in: table */ - char* name); /* in: index name */ + const char* name); /* in: index name */ /************************************************************************** Prints a table definition. 
*/ @@ -320,7 +344,7 @@ Prints a table data when we know the table name. */ void dict_table_print_by_name( /*=====================*/ - char* name); + const char* name); /************************************************************************** Outputs info on foreign keys of a table. */ @@ -444,6 +468,17 @@ dict_table_get_sys_col_no( /* out: column number */ dict_table_t* table, /* in: table */ ulint sys); /* in: DATA_ROW_ID, ... */ +/************************************************************************ +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. */ + +ibool +dict_table_col_in_clustered_key( +/*============================*/ + /* out: TRUE if the column, or its prefix, is + in the clustered key */ + dict_table_t* table, /* in: table */ + ulint n); /* in: column number */ /*********************************************************************** Copies types of columns contained in table to tuple. */ @@ -714,7 +749,8 @@ dict_tree_build_node_ptr( /*=====================*/ /* out, own: node pointer */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec, /* in: record for which to build node pointer */ + rec_t* rec, /* in: record for which to build node + pointer */ ulint page_no,/* in: page number to put in node pointer */ mem_heap_t* heap, /* in: memory heap where pointer created */ ulint level); /* in: level of rec in tree: 0 means leaf @@ -887,7 +923,7 @@ struct dict_sys_struct{ dict_table_t* sys_columns; /* SYS_COLUMNS table */ dict_table_t* sys_indexes; /* SYS_INDEXES table */ dict_table_t* sys_fields; /* SYS_FIELDS table */ -}; +}; #ifndef UNIV_NONINL #include "dict0dict.ic" diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic index 57ef4b896f5..0f7cc8973db 100644 --- a/innobase/include/dict0dict.ic +++ b/innobase/include/dict0dict.ic @@ -536,8 +536,8 @@ UNIV_INLINE dict_table_t* dict_table_check_if_in_cache_low( /*==============================*/ - /* 
out: table, NULL if not found */ - char* table_name) /* in: table name */ + /* out: table, NULL if not found */ + const char* table_name) /* in: table name */ { dict_table_t* table; ulint table_fold; @@ -562,8 +562,8 @@ UNIV_INLINE dict_table_t* dict_table_get_low( /*===============*/ - /* out: table, NULL if not found */ - char* table_name) /* in: table name */ + /* out: table, NULL if not found */ + const char* table_name) /* in: table name */ { dict_table_t* table; @@ -642,7 +642,7 @@ dict_table_get_index( /*=================*/ /* out: index, NULL if does not exist */ dict_table_t* table, /* in: table */ - char* name) /* in: index name */ + const char* name) /* in: index name */ { dict_index_t* index = NULL; diff --git a/innobase/include/dict0load.h b/innobase/include/dict0load.h index b60996a8dab..d4dccb33373 100644 --- a/innobase/include/dict0load.h +++ b/innobase/include/dict0load.h @@ -15,14 +15,26 @@ Created 4/24/1996 Heikki Tuuri #include "ut0byte.h" /************************************************************************ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). +In a normal startup we just scan the biggest space id, and store it to +fil_system. */ + +void +dict_check_tablespaces_or_store_max_id( +/*===================================*/ + ibool in_crash_recovery); /* in: are we doing a crash recovery */ +/************************************************************************ Finds the first table name in the given database. */ char* dict_get_first_table_name_in_db( /*============================*/ - /* out, own: table name, NULL if does not exist; - the caller must free the memory in the string! */ - char* name); /* in: database name which ends to '/' */ + /* out, own: table name, NULL if + does not exist; the caller must free + the memory in the string! 
*/ + const char* name); /* in: database name which ends to '/' */ /************************************************************************ Loads a table definition and also all its index definitions, and also the cluster definition if the table is a member in a cluster. Also loads @@ -32,8 +44,13 @@ a foreign key references columns in this table. */ dict_table_t* dict_load_table( /*============*/ - /* out: table, NULL if does not exist */ - char* name); /* in: table name */ + /* out: table, NULL if does not exist; + if the table is stored in an .ibd file, + but the file does not exist, + then we set the ibd_file_missing flag TRUE + in the table object we return */ + const char* name); /* in: table name in the + databasename/tablename format */ /*************************************************************************** Loads a table object based on the table id. */ @@ -61,8 +78,8 @@ already in the dictionary cache. */ ulint dict_load_foreigns( /*===============*/ - /* out: DB_SUCCESS or error code */ - char* table_name); /* in: table name */ + /* out: DB_SUCCESS or error code */ + const char* table_name); /* in: table name */ /************************************************************************ Prints to the standard output information on all tables found in the data dictionary system table. */ diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index 6d6e95ab511..3fc3e850987 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -48,27 +48,28 @@ Creates a table memory object. 
*/ dict_table_t* dict_mem_table_create( /*==================*/ - /* out, own: table object */ - char* name, /* in: table name */ - ulint space, /* in: space where the clustered index of - the table is placed; this parameter is - ignored if the table is made a member of - a cluster */ - ulint n_cols); /* in: number of columns */ + /* out, own: table object */ + const char* name, /* in: table name */ + ulint space, /* in: space where the clustered index + of the table is placed; this parameter + is ignored if the table is made + a member of a cluster */ + ulint n_cols); /* in: number of columns */ /************************************************************************** Creates a cluster memory object. */ dict_cluster_t* dict_mem_cluster_create( /*====================*/ - /* out, own: cluster object (where the type - dict_cluster_t == dict_table_t) */ - char* name, /* in: cluster name */ - ulint space, /* in: space where the clustered indexes - of the member tables are placed */ - ulint n_cols, /* in: number of columns */ - ulint mix_len); /* in: length of the common key prefix in the - cluster */ + /* out, own: cluster object (where the + type dict_cluster_t == dict_table_t) */ + const char* name, /* in: cluster name */ + ulint space, /* in: space where the clustered + indexes of the member tables are + placed */ + ulint n_cols, /* in: number of columns */ + ulint mix_len); /* in: length of the common key prefix + in the cluster */ /************************************************************************** Declares a non-published table as a member in a cluster. */ @@ -76,7 +77,7 @@ void dict_mem_table_make_cluster_member( /*===============================*/ dict_table_t* table, /* in: non-published table */ - char* cluster_name); /* in: cluster name */ + const char* cluster_name); /* in: cluster name */ /************************************************************************** Adds a column definition to a table. 
*/ @@ -84,7 +85,7 @@ void dict_mem_table_add_col( /*===================*/ dict_table_t* table, /* in: table */ - char* name, /* in: column name */ + const char* name, /* in: column name */ ulint mtype, /* in: main datatype */ ulint prtype, /* in: precise type */ ulint len, /* in: length */ @@ -95,14 +96,15 @@ Creates an index memory object. */ dict_index_t* dict_mem_index_create( /*==================*/ - /* out, own: index object */ - char* table_name, /* in: table name */ - char* index_name, /* in: index name */ - ulint space, /* in: space where the index tree is placed, - ignored if the index is of the clustered - type */ - ulint type, /* in: DICT_UNIQUE, DICT_CLUSTERED, ... ORed */ - ulint n_fields); /* in: number of fields */ + /* out, own: index object */ + const char* table_name, /* in: table name */ + const char* index_name, /* in: index name */ + ulint space, /* in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /* in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /* in: number of fields */ /************************************************************************** Adds a field definition to an index. NOTE: does not take a copy of the column name if the field is a column. 
The memory occupied @@ -112,7 +114,7 @@ void dict_mem_index_add_field( /*=====================*/ dict_index_t* index, /* in: index */ - char* name, /* in: column name */ + const char* name, /* in: column name */ ulint order, /* in: order criterion; 0 means an ascending order */ ulint prefix_len); /* in: 0 or the column prefix length @@ -142,7 +144,7 @@ struct dict_col_struct{ clustered index */ ulint ord_part;/* count of how many times this column appears in ordering fields of an index */ - char* name; /* name */ + const char* name; /* name */ dtype_t type; /* data type */ dict_table_t* table; /* back pointer to table of this column */ ulint aux; /* this is used as an auxiliary variable @@ -154,7 +156,7 @@ struct dict_col_struct{ /* Data structure for a field in an index */ struct dict_field_struct{ dict_col_t* col; /* pointer to the table column */ - char* name; /* name of the column */ + const char* name; /* name of the column */ ulint order; /* flags for ordering this field: DICT_DESCEND, ... 
*/ ulint prefix_len; /* 0 or the length of the column @@ -196,8 +198,8 @@ struct dict_index_struct{ dulint id; /* id of the index */ mem_heap_t* heap; /* memory heap */ ulint type; /* index type */ - char* name; /* index name */ - char* table_name; /* table name */ + const char* name; /* index name */ + const char* table_name; /* table name */ dict_table_t* table; /* back pointer to table */ ulint space; /* space where the index tree is placed */ ulint page_no;/* page number of the index tree root */ @@ -250,12 +252,12 @@ struct dict_foreign_struct{ or DICT_FOREIGN_ON_DELETE_SET_NULL */ char* foreign_table_name;/* foreign table name */ dict_table_t* foreign_table; /* table where the foreign key is */ - char** foreign_col_names;/* names of the columns in the + const char** foreign_col_names;/* names of the columns in the foreign key */ char* referenced_table_name;/* referenced table name */ dict_table_t* referenced_table;/* table where the referenced key is */ - char** referenced_col_names;/* names of the referenced + const char** referenced_col_names;/* names of the referenced columns in the referenced table */ ulint n_fields; /* number of indexes' first fields for which the the foreign key @@ -294,9 +296,16 @@ struct dict_table_struct{ dulint id; /* id of the table or cluster */ ulint type; /* DICT_TABLE_ORDINARY, ... 
*/ mem_heap_t* heap; /* memory heap */ - char* name; /* table name */ + const char* name; /* table name */ ulint space; /* space where the clustered index of the table is placed */ + ibool ibd_file_missing;/* TRUE if this is in a single-table + tablespace and the .ibd file is missing; then + we must return in ha_innodb.cc an error if the + user tries to query such an orphaned table */ + ibool tablespace_discarded;/* this flag is set TRUE when the + user calls DISCARD TABLESPACE on this table, + and reset to FALSE in IMPORT TABLESPACE */ hash_node_t name_hash; /* hash chain node */ hash_node_t id_hash; /* hash chain node */ ulint n_def; /* number of columns defined so far */ @@ -355,7 +364,7 @@ struct dict_table_struct{ byte mix_id_buf[12]; /* mix id of a mixed table written in a compressed form */ - char* cluster_name; /* if the table is a member in a + const char* cluster_name; /* if the table is a member in a cluster, this is the name of the cluster */ /*----------------------*/ ibool does_not_fit_in_memory; diff --git a/innobase/include/dyn0dyn.ic b/innobase/include/dyn0dyn.ic index 787615cae09..b6c4808398b 100644 --- a/innobase/include/dyn0dyn.ic +++ b/innobase/include/dyn0dyn.ic @@ -7,7 +7,7 @@ Created 2/5/1996 Heikki Tuuri *******************************************************/ #define DYN_BLOCK_MAGIC_N 375767 -#define DYN_BLOCK_FULL_FLAG 0x1000000 +#define DYN_BLOCK_FULL_FLAG 0x1000000UL /**************************************************************** Adds a new block to a dyn array. */ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index ef41ca21d2c..45549aee63c 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -17,10 +17,13 @@ Created 10/25/1995 Heikki Tuuri #include "os0file.h" /* When mysqld is run, the default directory "." 
is the mysqld datadir, but in -ibbackup we must set it explicitly; the path must NOT contain the trailing +ibbackup we must set it explicitly; the path must NOT contain the trailing '/' or '\' */ extern const char* fil_path_to_mysql_datadir; +/* Initial size of a single-table tablespace in pages */ +#define FIL_IBD_FILE_INITIAL_SIZE 4 + /* 'null' (undefined) page offset in the context of file spaces */ #define FIL_NULL ULINT32_UNDEFINED @@ -65,10 +68,8 @@ extern fil_addr_t fil_addr_null; first page in a data file: the file has been flushed to disk at least up to this lsn */ -#define FIL_PAGE_ARCH_LOG_NO 34 /* this is only defined for the - first page in a data file: the latest - archived log file number when the - flush lsn above was written */ +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /* starting from 4.1.x this + contains the space id of the page */ #define FIL_PAGE_DATA 38 /* start of the data on the page */ /* File page trailer */ @@ -91,50 +92,52 @@ extern fil_addr_t fil_addr_null; extern ulint fil_n_pending_log_flushes; extern ulint fil_n_pending_tablespace_flushes; + /*********************************************************************** -Reserves a right to open a single file. The right must be released with -fil_release_right_to_open. */ +Returns the version number of a tablespace, -1 if not found. */ -void -fil_reserve_right_to_open(void); -/*===========================*/ +ib_longlong +fil_space_get_version( +/*==================*/ + /* out: version number, -1 if the tablespace does not + exist in the memory cache */ + ulint id); /* in: space id */ /*********************************************************************** -Releases a right to open a single file. */ +Returns the latch of a file space. */ -void -fil_release_right_to_open(void); -/*===========================*/ -/************************************************************************ -Returns TRUE if file address is undefined. 
*/ -ibool -fil_addr_is_null( -/*=============*/ - /* out: TRUE if undefined */ - fil_addr_t addr); /* in: address */ -/******************************************************************** -Initializes the file system of this module. */ +rw_lock_t* +fil_space_get_latch( +/*================*/ + /* out: latch protecting storage allocation */ + ulint id); /* in: space id */ +/*********************************************************************** +Returns the type of a file space. */ -void -fil_init( -/*=====*/ - ulint max_n_open); /* in: max number of open files */ -/******************************************************************** -Initializes the ibuf indexes at a database start. This can be called -after the file space headers have been created and the dictionary system -has been initialized. */ +ulint +fil_space_get_type( +/*===============*/ + /* out: FIL_TABLESPACE or FIL_LOG */ + ulint id); /* in: space id */ +/*********************************************************************** +Returns the ibuf data of a file space. */ -void -fil_ibuf_init_at_db_start(void); -/*===========================*/ +ibuf_data_t* +fil_space_get_ibuf_data( +/*====================*/ + /* out: ibuf data for this space */ + ulint id); /* in: space id */ /*********************************************************************** -Creates a space object and puts it to the file system. */ +Appends a new file to the chain of files of a space. File must be closed. 
*/ void -fil_space_create( -/*=============*/ - char* name, /* in: space name */ - ulint id, /* in: space id */ - ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */ +fil_node_create( +/*============*/ + const char* name, /* in: file name (file must be closed) */ + ulint size, /* in: file size in database blocks, rounded + downwards to an integer */ + ulint id, /* in: space id where to append */ + ibool is_raw);/* in: TRUE if a raw device or + a raw disk partition */ /******************************************************************** Drops files from the start of a file space, so that its size is cut by the amount given. */ @@ -146,48 +149,88 @@ fil_space_truncate_start( ulint trunc_len); /* in: truncate by this much; it is an error if this does not equal to the combined size of some initial files in the space */ -/************************************************************************** -Tries to extend a data file by the number of pages given. Any fractions of a -megabyte are ignored. */ +/*********************************************************************** +Creates a space memory object and puts it to the 'fil system' hash table. If +there is an error, prints an error message to the .err log. 
*/ ibool -fil_extend_last_data_file( -/*======================*/ - /* out: TRUE if success, also if we run - out of disk space we may return TRUE */ - ulint* actual_increase,/* out: number of pages we were able to - extend, here the orginal size of the file and - the resulting size of the file are rounded - downwards to a full megabyte, and the - difference expressed in pages is returned */ - ulint size_increase); /* in: try to extend this many pages */ +fil_space_create( +/*=============*/ + /* out: TRUE if success */ + const char* name, /* in: space name */ + ulint id, /* in: space id */ + ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */ /*********************************************************************** -Frees a space object from a file system. Closes the files in the chain -but does not delete them. */ +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. */ -void +ibool fil_space_free( /*===========*/ + /* out: TRUE if success */ ulint id); /* in: space id */ /*********************************************************************** -Returns the latch of a file space. */ - -rw_lock_t* -fil_space_get_latch( -/*================*/ - /* out: latch protecting storage allocation */ - ulint id); /* in: space id */ -/*********************************************************************** -Returns the type of a file space. */ +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. */ ulint -fil_space_get_type( +fil_space_get_size( /*===============*/ - /* out: FIL_TABLESPACE or FIL_LOG */ + /* out: space size, 0 if space not found */ ulint id); /* in: space id */ +/*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. 
*/ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no);/* in: page number */ +/******************************************************************** +Initializes the tablespace memory cache. */ + +void +fil_init( +/*=====*/ + ulint max_n_open); /* in: max number of open files */ +/*********************************************************************** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. */ + +void +fil_open_log_and_system_tablespace_files(void); +/*==========================================*/ +/*********************************************************************** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. */ + +void +fil_close_all_files(void); +/*=====================*/ +/*********************************************************************** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ + +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id);/* in: maximum known id */ +/******************************************************************** +Initializes the ibuf data structure for space 0 == the system tablespace. +This can be called after the file space headers have been created and the +dictionary system has been initialized. 
*/ + +void +fil_ibuf_init_at_db_start(void); +/*===========================*/ /******************************************************************** Writes the flushed lsn and the latest archived log number to the page -header of the first page of each data file. */ +header of the first page of each data file in the system tablespace. */ ulint fil_write_flushed_lsn_to_data_files( @@ -205,53 +248,278 @@ fil_read_flushed_lsn_and_arch_log_no( os_file_t data_file, /* in: open data file */ ibool one_read_already, /* in: TRUE if min and max parameters below already contain sensible data */ - dulint* min_flushed_lsn, /* in/out: */ +#ifdef UNIV_LOG_ARCHIVE ulint* min_arch_log_no, /* in/out: */ - dulint* max_flushed_lsn, /* in/out: */ - ulint* max_arch_log_no); /* in/out: */ + ulint* max_arch_log_no, /* in/out: */ +#endif /* UNIV_LOG_ARCHIVE */ + dulint* min_flushed_lsn, /* in/out: */ + dulint* max_flushed_lsn); /* in/out: */ /*********************************************************************** -Returns the ibuf data of a file space. */ +Increments the count of pending insert buffer page merges, if space is not +being deleted. */ -ibuf_data_t* -fil_space_get_ibuf_data( -/*====================*/ - /* out: ibuf data for this space */ +ibool +fil_inc_pending_ibuf_merges( +/*========================*/ + /* out: TRUE if being deleted, and ibuf merges should + be skipped */ ulint id); /* in: space id */ /*********************************************************************** -Returns the size of the space in pages. */ +Decrements the count of pending insert buffer page merges. */ + +void +fil_decr_pending_ibuf_merges( +/*========================*/ + ulint id); /* in: space id */ +/*********************************************************************** +Parses the body of a log record written about an .ibd file operation. That is, +the log record part after the standard (type, space id, page no) header of the +log record. 
+ +If desired, also replays the delete or rename operation if the .ibd file +exists and the space id in it matches. Replays the create operation if a file +at that path does not exist yet. If the database directory for the file to be +created does not exist, then we create the directory, too. + +Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the +datadir that we should use in replaying the file operations. */ + +byte* +fil_op_log_parse_or_replay( +/*=======================*/ + /* out: end of log record, or NULL if the + record was not completely contained between + ptr and end_ptr */ + byte* ptr, /* in: buffer containing the log record body, + or an initial segment of it, if the record does + not fit completely between ptr and end_ptr */ + byte* end_ptr, /* in: buffer end */ + ulint type, /* in: the type of this log record */ + ibool do_replay, /* in: TRUE if we want to replay the + operation, and not just parse the log record */ + ulint space_id); /* in: if do_replay is TRUE, the space id of + the tablespace in question; otherwise + ignored */ +/*********************************************************************** +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. */ + +ibool +fil_delete_tablespace( +/*==================*/ + /* out: TRUE if success */ + ulint id); /* in: space id */ +/*********************************************************************** +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but +1) we do not drop the table from the data dictionary; +2) we remove all insert buffer entries for the tablespace immediately; in DROP +TABLE they are only removed gradually in the background; +3) when the user does IMPORT TABLESPACE, the tablespace will have the same id +as it originally had. 
*/ + +ibool +fil_discard_tablespace( +/*===================*/ + /* out: TRUE if success */ + ulint id); /* in: space id */ +/*********************************************************************** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. */ + +ibool +fil_rename_tablespace( +/*==================*/ + /* out: TRUE if success */ + const char* old_name, /* in: old table name in the standard + databasename/tablename format of + InnoDB, or NULL if we do the rename + based on the space id only */ + ulint id, /* in: space id */ + const char* new_name); /* in: new table name in the standard + databasename/tablename format + of InnoDB */ +/*********************************************************************** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. */ ulint -fil_space_get_size( -/*===============*/ - /* out: space size */ +fil_create_new_single_table_tablespace( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + ulint* space_id, /* in/out: space id; if this is != 0, + then this is an input parameter, + otherwise output */ + const char* tablename, /* in: the table name in the usual + databasename/tablename format + of InnoDB */ + ulint size); /* in: the initial size of the + tablespace file in pages, + must be >= FIL_IBD_FILE_INITIAL_SIZE */ +/************************************************************************ +Tries to open a single-table tablespace and checks the space id is right in +it. If does not succeed, prints an error message to the .err log. This +function is used to open the tablespace when we load a table definition +to the dictionary cache. NOTE that we assume this operation is used under the +protection of the dictionary mutex, so that two users cannot race here. 
*/ + +ibool +fil_open_single_table_tablespace( +/*=============================*/ + /* out: TRUE if success */ + ulint id, /* in: space id */ + const char* name); /* in: table name in the + databasename/tablename format */ +/************************************************************************ +It is possible, though very improbable, that the lsn's in the tablespace to be +imported have risen above the current system lsn, if a lengthy purge, ibuf +merge, or rollback was performed on a backup taken with ibbackup. If that is +the case, reset page lsn's in the file. We assume that mysqld was shut down +after it performed these cleanup operations on the .ibd file, so that it at +the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the +first page of the .ibd file, and we can determine whether we need to reset the +lsn's just by looking at that flush lsn. */ + +ibool +fil_reset_too_high_lsns( +/*====================*/ + /* out: TRUE if success */ + const char* name, /* in: table name in the + databasename/tablename format */ + dulint current_lsn); /* in: reset lsn's if the lsn stamped + to FIL_PAGE_FILE_FLUSH_LSN in the + first page is too high */ +/************************************************************************ +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. 
*/ + +ulint +fil_load_single_table_tablespaces(void); +/*===================================*/ + /* out: DB_SUCCESS or error number */ +/************************************************************************ +If we need crash recovery, and we have called +fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(), +we can call this function to print an error message of orphaned .ibd files +for which there is not a data dictionary entry with a matching table name +and space id. */ + +void +fil_print_orphaned_tablespaces(void); +/*================================*/ +/*********************************************************************** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. */ + +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + /* out: TRUE if does not exist or is being\ + deleted */ + ulint id, /* in: space id */ + ib_longlong version);/* in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +/*********************************************************************** +Returns TRUE if a single-table tablespace exists in the memory cache. */ + +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + /* out: TRUE if exists */ ulint id); /* in: space id */ /*********************************************************************** -Checks if the pair space, page_no refers to an existing page in a -tablespace file space. */ +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. 
*/ ibool -fil_check_adress_in_tablespace( +fil_space_for_table_exists_in_mem( +/*==============================*/ + /* out: TRUE if a matching tablespace + exists in the memory cache */ + ulint id, /* in: space id */ + const char* name, /* in: table name in the standard + 'databasename/tablename' format */ + ibool mark_space, /* in: in crash recovery, at database + startup we mark all spaces which have + an associated table in the InnoDB + data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist); + /* in: print detailed error + information to the .err log if a + matching tablespace is not found from + memory */ +/************************************************************************** +Tries to extend a data file so that it would accommodate the number of pages +given. The tablespace must be cached in the memory cache. If the space is big +enough already, does nothing. */ + +ibool +fil_extend_space_to_desired_size( +/*=============================*/ + /* out: TRUE if success */ + ulint* actual_size, /* out: size of the space after extension; + if we ran out of disk space this may be lower + than the desired size */ + ulint space_id, /* in: space id, must be != 0 */ + ulint size_after_extend);/* in: desired size in pages after the + extension; if the current space size is bigger + than this already, the function does nothing */ +#ifdef UNIV_HOTBACKUP +/************************************************************************ +Extends all tablespaces to the size stored in the space header. During the +ibbackup --apply-log phase we extended the spaces on-demand so that log records +could be applied, but that may have left spaces still too small compared to +the size stored in the space header. 
*/ + +void +fil_extend_tablespaces_to_stored_len(void); +/*======================================*/ +#endif +/*********************************************************************** +Tries to reserve free extents in a file space. */ + +ibool +fil_space_reserve_free_extents( /*===========================*/ - /* out: TRUE if the address is meaningful */ - ulint id, /* in: space id */ - ulint page_no);/* in: page number */ + /* out: TRUE if succeed */ + ulint id, /* in: space id */ + ulint n_free_now, /* in: number of free extents now */ + ulint n_to_reserve); /* in: how many one wants to reserve */ /*********************************************************************** -Appends a new file to the chain of files of a space. -File must be closed. */ +Releases free extents in a file space. */ void -fil_node_create( -/*============*/ - char* name, /* in: file name (file must be closed) */ - ulint size, /* in: file size in database blocks, rounded downwards - to an integer */ - ulint id); /* in: space id where to append */ +fil_space_release_free_extents( +/*===========================*/ + ulint id, /* in: space id */ + ulint n_reserved); /* in: how many one reserved */ +/*********************************************************************** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ + +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id); /* in: space id */ /************************************************************************ Reads or writes data. This operation is asynchronous (aio). 
*/ -void +ulint fil_io( /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, ORed to OS_FILE_LOG, if a log i/o and ORed to OS_AIO_SIMULATED_WAKE_LATER @@ -267,9 +535,9 @@ fil_io( ulint byte_offset, /* in: remainder of offset in bytes; in aio this must be divisible by the OS block size */ - ulint len, /* in: how many bytes to read; this must - not cross a file boundary; in aio this must - be a block size multiple */ + ulint len, /* in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ void* buf, /* in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ @@ -277,12 +545,15 @@ fil_io( aio used, else ignored */ /************************************************************************ Reads data from a space to a buffer. Remember that the possible incomplete -blocks at the end of a file are ignored: they are not taken into account when +blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. */ -void +ulint fil_read( /*=====*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -297,12 +568,15 @@ fil_read( aio used, else ignored */ /************************************************************************ Writes data to a space from a buffer. Remember that the possible incomplete -blocks at the end of a file are ignored: they are not taken into account when +blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. 
*/ -void +ulint fil_write( /*======*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -327,7 +601,8 @@ fil_aio_wait( ulint segment); /* in: the number of the segment in the aio array to wait for */ /************************************************************************** -Flushes to disk possible writes cached by the OS. */ +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ void fil_flush( @@ -343,13 +618,21 @@ fil_flush_file_spaces( /*==================*/ ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */ /********************************************************************** -Checks the consistency of the file system. */ +Checks the consistency of the tablespace cache. */ ibool fil_validate(void); /*==============*/ /* out: TRUE if ok */ /************************************************************************ +Returns TRUE if file address is undefined. */ + +ibool +fil_addr_is_null( +/*=============*/ + /* out: TRUE if undefined */ + fil_addr_t addr); /* in: address */ +/************************************************************************ Accessor functions for a file page */ ulint @@ -373,32 +656,7 @@ fil_page_get_type( /* out: type; NOTE that if the type has not been written to page, the return value not defined */ byte* page); /* in: file page */ -/*********************************************************************** -Tries to reserve free extents in a file space. 
*/ - -ibool -fil_space_reserve_free_extents( -/*===========================*/ - /* out: TRUE if succeed */ - ulint id, /* in: space id */ - ulint n_free_now, /* in: number of free extents now */ - ulint n_to_reserve); /* in: how many one wants to reserve */ -/*********************************************************************** -Releases free extents in a file space. */ -void -fil_space_release_free_extents( -/*===========================*/ - ulint id, /* in: space id */ - ulint n_reserved); /* in: how many one reserved */ -/*********************************************************************** -Gets the number of reserved extents. If the database is silent, this number -should be zero. */ - -ulint -fil_space_get_n_reserved_extents( -/*=============================*/ - ulint id); /* in: space id */ typedef struct fil_space_struct fil_space_t; diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h index 3494f336b1e..2fcde882df7 100644 --- a/innobase/include/fsp0fsp.h +++ b/innobase/include/fsp0fsp.h @@ -55,7 +55,7 @@ ulint fsp_header_get_free_limit( /*======================*/ /* out: free limit in megabytes */ - ulint space); /* in: space id */ + ulint space); /* in: space id, must be 0 */ /************************************************************************** Gets the size of the tablespace from the tablespace header. If we do not have an auto-extending data file, this should be equal to the size of the @@ -65,9 +65,35 @@ ulint fsp_header_get_tablespace_size( /*===========================*/ /* out: size in pages */ - ulint space); /* in: space id */ + ulint space); /* in: space id, must be 0 */ /************************************************************************** -Initializes the space header of a new created space. */ +Reads the file space size stored in the header page. 
*/ + +ulint +fsp_get_size_low( +/*=============*/ + /* out: tablespace size stored in the space header */ + page_t* page); /* in: header page (page 0 in the tablespace) */ +/************************************************************************** +Reads the space id from the first page of a tablespace. */ + +ulint +fsp_header_get_space_id( +/*====================*/ + /* out: space id, ULINT_UNDEFINED if error */ + page_t* page); /* in: first page of a tablespace */ +/************************************************************************** +Writes the space id to a tablespace header. This function is used past the +buffer pool when we in fil0fil.c create a new single-table tablespace. */ + +void +fsp_header_write_space_id( +/*======================*/ + page_t* page, /* in: first page in the space */ + ulint space_id); /* in: space id */ +/************************************************************************** +Initializes the space header of a newly created space and creates also the +insert buffer tree root if space == 0. 
*/ void fsp_header_init( @@ -117,12 +143,12 @@ fseg_create_general( will belong to the created segment */ ulint byte_offset, /* in: byte offset of the created segment header on the page */ - ibool has_done_reservation, /* in: TRUE if the caller has - already done the reservation for the pages - with fsp_reserve_free_extents (at least 2 extents: - one for the inode and, then there other for the - segment) is no need to do the check for this - individual operation */ + ibool has_done_reservation, /* in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no need to do the check for this individual + operation */ mtr_t* mtr); /* in: mtr */ /************************************************************************** Calculates the number of pages reserved by a segment, and how many pages are @@ -194,12 +220,21 @@ two types of allocation: when space is scarce, FSP_NORMAL allocations will not succeed, but the latter two allocations will succeed, if possible. The purpose is to avoid dead end where the database is full but the user cannot free any space because these freeing operations temporarily -reserve some space. */ +reserve some space. + +Single-table tablespaces whose size is < 32 pages are a special case. In this +function we would liberally reserve several 64 page extents for every page +split or merge in a B-tree. But we do not want to waste disk space if the table +only occupies < 32 pages. That is why we apply different rules in that special +case, just ensuring that there are 3 free pages available. 
*/ ibool fsp_reserve_free_extents( /*=====================*/ /* out: TRUE if we were able to make the reservation */ + ulint* n_reserved,/* out: number of extents actually reserved; if we + return TRUE and the tablespace size is < 64 pages, + then this can be 0, otherwise it is n_ext */ ulint space, /* in: space id */ ulint n_ext, /* in: number of extents to reserve */ ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */ @@ -337,8 +372,8 @@ pages: */ #define FSP_FIRST_INODE_PAGE_NO 2 #define FSP_IBUF_HEADER_PAGE_NO 3 #define FSP_IBUF_TREE_ROOT_PAGE_NO 4 - /* The ibuf tree root page number in each - tablespace; its fseg inode is on the page + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page number FSP_FIRST_INODE_PAGE_NO */ #define FSP_TRX_SYS_PAGE_NO 5 #define FSP_FIRST_RSEG_PAGE_NO 6 diff --git a/innobase/include/fut0lst.ic b/innobase/include/fut0lst.ic index d2e79cf7640..c0d61833b48 100644 --- a/innobase/include/fut0lst.ic +++ b/innobase/include/fut0lst.ic @@ -23,7 +23,7 @@ Created 11/28/1995 Heikki Tuuri #define FLST_FIRST 4 /* 6-byte address of the first element of the list; undefined if empty list */ #define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the - first element of the list; undefined + last element of the list; undefined if empty list */ /************************************************************************ diff --git a/innobase/include/ha0ha.h b/innobase/include/ha0ha.h index 908db0a6f26..bdaecfcc57a 100644 --- a/innobase/include/ha0ha.h +++ b/innobase/include/ha0ha.h @@ -28,7 +28,7 @@ ha_search_and_get_data( /************************************************************* Looks for an element when we know the pointer to the data and updates the pointer to data if found. 
*/ -UNIV_INLINE + void ha_search_and_update_if_found( /*==========================*/ diff --git a/innobase/include/ha0ha.ic b/innobase/include/ha0ha.ic index 2f02f6bdb4e..63cd19fafc3 100644 --- a/innobase/include/ha0ha.ic +++ b/innobase/include/ha0ha.ic @@ -164,31 +164,6 @@ ha_search_with_data( } /************************************************************* -Looks for an element when we know the pointer to the data, and updates -the pointer to data, if found. */ -UNIV_INLINE -void -ha_search_and_update_if_found( -/*==========================*/ - hash_table_t* table, /* in: hash table */ - ulint fold, /* in: folded value of the searched data */ - void* data, /* in: pointer to the data */ - void* new_data)/* in: new pointer to the data */ -{ - ha_node_t* node; - -#ifdef UNIV_SYNC_DEBUG - ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); -#endif /* UNIV_SYNC_DEBUG */ - - node = ha_search_with_data(table, fold, data); - - if (node) { - node->data = new_data; - } -} - -/************************************************************* Looks for an element when we know the pointer to the data, and deletes it from the hash table, if found. 
*/ UNIV_INLINE diff --git a/innobase/include/hash0hash.h b/innobase/include/hash0hash.h index 101cb5d434f..51315e40875 100644 --- a/innobase/include/hash0hash.h +++ b/innobase/include/hash0hash.h @@ -109,7 +109,7 @@ do {\ \ while (struct3333->NAME != DATA) {\ \ - ut_ad(struct3333);\ + ut_a(struct3333);\ struct3333 = struct3333->NAME;\ }\ \ @@ -305,6 +305,8 @@ struct hash_cell_struct{ /* The hash table structure */ struct hash_table_struct { + ibool adaptive;/* TRUE if this is the hash table of the + adaptive hash index */ ulint n_cells;/* number of cells in the hash table */ hash_cell_t* array; /* pointer to cell array */ ulint n_mutexes;/* if mutexes != NULL, then the number of diff --git a/innobase/include/ibuf0ibuf.h b/innobase/include/ibuf0ibuf.h index 80958a593cb..4f38ab4f1e9 100644 --- a/innobase/include/ibuf0ibuf.h +++ b/innobase/include/ibuf0ibuf.h @@ -40,6 +40,13 @@ void ibuf_init_at_db_start(void); /*=======================*/ /************************************************************************* +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ + +void +ibuf_update_max_tablespace_id(void); +/*===============================*/ +/************************************************************************* Initializes an ibuf bitmap page. */ void @@ -198,8 +205,8 @@ When an index page is read from a disk to the buffer pool, this function inserts to the page the possible index entries buffered in the insert buffer. The entries are deleted from the insert buffer. If the page is not read, but created in the buffer pool, this function deletes its buffered entries from -the insert buffer; note that there can exist entries if the page belonged to -an index which was dropped. */ +the insert buffer; there can exist entries for such a page if the page +belonged to an index which subsequently was dropped. 
*/ void ibuf_merge_or_delete_for_page( @@ -207,7 +214,21 @@ ibuf_merge_or_delete_for_page( page_t* page, /* in: if page has been read from disk, pointer to the page x-latched, else NULL */ ulint space, /* in: space id of the index page */ - ulint page_no);/* in: page number of the index page */ + ulint page_no,/* in: page number of the index page */ + ibool update_ibuf_bitmap);/* in: normally this is set to TRUE, but if + we have deleted or are deleting the tablespace, then we + naturally do not want to update a non-existent bitmap + page */ +/************************************************************************* +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ + +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space); /* in: space id */ /************************************************************************* Contracts insert buffer trees by reading pages to the buffer pool. */ @@ -257,6 +278,13 @@ ibuf_count_get( ulint space, /* in: space id */ ulint page_no);/* in: page number */ /********************************************************************** +Looks if the insert buffer is empty. */ + +ibool +ibuf_is_empty(void); +/*===============*/ + /* out: TRUE if empty */ +/********************************************************************** Prints info of ibuf. 
*/ void diff --git a/innobase/include/ibuf0ibuf.ic b/innobase/include/ibuf0ibuf.ic index 0886c8c02cc..68f7ce9c1d0 100644 --- a/innobase/include/ibuf0ibuf.ic +++ b/innobase/include/ibuf0ibuf.ic @@ -218,7 +218,7 @@ ibuf_update_free_bits_if_full( } if (after == 0) { - /* We move the page to front of the buffer pool LRU list: + /* We move the page to the front of the buffer pool LRU list: the purpose of this is to prevent those pages to which we cannot make inserts using the insert buffer from slipping out of the buffer pool */ diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h index 49045e2ed7a..d14a116072d 100644 --- a/innobase/include/log0log.h +++ b/innobase/include/log0log.h @@ -489,9 +489,9 @@ Peeks the current lsn. */ ibool log_peek_lsn( /*=========*/ - /* out: TRUE if success, FALSE if could not get the - log system mutex */ - dulint* lsn); /* out: if returns TRUE, current lsn is here */ + /* out: TRUE if success, FALSE if could not get the + log system mutex */ + dulint* lsn); /* out: if returns TRUE, current lsn is here */ /************************************************************************** Refreshes the statistics used to print per-second averages. 
*/ @@ -570,12 +570,18 @@ extern log_t* log_sys; #define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END #define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END) #define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END) - /* current fsp free limit in the - tablespace, in units of one megabyte */ + /* current fsp free limit in + tablespace 0, in units of one + megabyte; this information is only used + by ibbackup to decide if it can + truncate unused ends of + non-auto-extending data files in space + 0 */ #define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END) /* this magic number tells if the checkpoint contains the above field: - the field was added to InnoDB-3.23.50 */ + the field was added to + InnoDB-3.23.50 */ #define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END) #define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243 @@ -764,11 +770,11 @@ struct log_struct{ called */ /* Fields involved in checkpoints */ - ulint log_group_capacity; /* capacity of the log group; if - the checkpoint age exceeds this, it is - a serious error because it is possible - we will then overwrite log and spoil - crash recovery */ + ulint log_group_capacity; /* capacity of the log group; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash recovery */ ulint max_modified_age_async; /* when this recommended value for lsn - buf_pool_get_oldest_modification() @@ -810,7 +816,8 @@ struct log_struct{ /* Fields involved in archiving */ ulint archiving_state;/* LOG_ARCH_ON, LOG_ARCH_STOPPING LOG_ARCH_STOPPED, LOG_ARCH_OFF */ - dulint archived_lsn; /* archiving has advanced to this lsn */ + dulint archived_lsn; /* archiving has advanced to this + lsn */ ulint max_archived_lsn_age_async; /* recommended maximum age of archived_lsn, before we start diff --git a/innobase/include/log0log.ic b/innobase/include/log0log.ic index 910cce88639..ca7531783a2 100644 --- a/innobase/include/log0log.ic +++ 
b/innobase/include/log0log.ic @@ -182,9 +182,9 @@ log_block_convert_lsn_to_no( no = ut_dulint_get_low(lsn) / OS_FILE_LOG_BLOCK_SIZE; no += (ut_dulint_get_high(lsn) % OS_FILE_LOG_BLOCK_SIZE) - * 2 * (0x80000000 / OS_FILE_LOG_BLOCK_SIZE); + * 2 * (0x80000000UL / OS_FILE_LOG_BLOCK_SIZE); - no = no & 0x3FFFFFFF; + no = no & 0x3FFFFFFFUL; return(no + 1); } @@ -206,7 +206,7 @@ log_block_calc_checksum( sh = 0; for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { - sum = sum & 0x7FFFFFFF; + sum = sum & 0x7FFFFFFFUL; sum += (((ulint)(*(block + i))) << sh) + (ulint)(*(block + i)); sh++; if (sh > 24) { @@ -349,7 +349,7 @@ log_reserve_and_write_fast( #ifdef UNIV_LOG_DEBUG log_check_log_recs(log->buf + log->old_buf_free, - log->buf_free - log->old_buf_free, log->old_lsn); + log->buf_free - log->old_buf_free, log->old_lsn); #endif return(lsn); } diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h index e5a5bc05563..658df4d5586 100644 --- a/innobase/include/log0recv.h +++ b/innobase/include/log0recv.h @@ -15,6 +15,10 @@ Created 9/20/1997 Heikki Tuuri #include "hash0hash.h" #include "log0log.h" +#ifdef UNIV_HOTBACKUP +extern ibool recv_replay_file_ops; +#endif /* UNIV_HOTBACKUP */ + /*********************************************************************** Reads the checkpoint info needed in hot backup. 
*/ @@ -25,8 +29,8 @@ recv_read_cp_info_for_backup( byte* hdr, /* in: buffer containing the log group header */ dulint* lsn, /* out: checkpoint lsn */ ulint* offset, /* out: checkpoint offset in the log group */ - ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database - is running with < version 3.23.50 of InnoDB */ + ulint* fsp_limit,/* out: fsp limit of space 0, 1000000000 if the + database is running with < version 3.23.50 of InnoDB */ dulint* cp_no, /* out: checkpoint number */ dulint* first_header_lsn); /* out: lsn of of the start of the first log file */ @@ -132,20 +136,25 @@ recv_reset_logs( dulint lsn, /* in: reset to this lsn rounded up to be divisible by OS_FILE_LOG_BLOCK_SIZE, after which we add LOG_BLOCK_HDR_SIZE */ +#ifdef UNIV_LOG_ARCHIVE ulint arch_log_no, /* in: next archived log file number */ +#endif /* UNIV_LOG_ARCHIVE */ ibool new_logs_created);/* in: TRUE if resetting logs is done at the log creation; FALSE if it is done after archive recovery */ +#ifdef UNIV_HOTBACKUP /********************************************************** Creates new log files after a backup has been restored. */ void recv_reset_log_files_for_backup( /*============================*/ - char* log_dir, /* in: log file directory path */ - ulint n_log_files, /* in: number of log files */ - ulint log_file_size, /* in: log file size */ - dulint lsn); /* in: new start lsn */ + const char* log_dir, /* in: log file directory path */ + ulint n_log_files, /* in: number of log files */ + ulint log_file_size, /* in: log file size */ + dulint lsn); /* in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ +#endif /* UNIV_HOTBACKUP */ /************************************************************ Creates the recovery system. 
*/ @@ -175,17 +184,15 @@ recv_apply_hashed_log_recs( disk and invalidated in buffer pool: this alternative means that no new log records can be generated during the application */ +#ifdef UNIV_HOTBACKUP /*********************************************************************** Applies log records in the hash table to a backup. */ void -recv_apply_log_recs_for_backup( -/*===========================*/ - ulint n_data_files, /* in: number of data files */ - char** data_files, /* in: array containing the paths to the - data files */ - ulint* file_sizes); /* in: sizes of the data files in database - pages */ +recv_apply_log_recs_for_backup(void); +/*================================*/ +#endif +#ifdef UNIV_LOG_ARCHIVE /************************************************************ Recovers from archived log files, and also from log files, if they exist. */ @@ -206,6 +213,7 @@ Completes recovery from archive. */ void recv_recovery_from_archive_finish(void); /*===================================*/ +#endif /* UNIV_LOG_ARCHIVE */ /*********************************************************************** Checks that a replica of a space is identical to the original space. */ @@ -334,8 +342,9 @@ extern ibool recv_no_ibuf_operations; extern ibool recv_needed_recovery; extern ibool recv_lsn_checks_on; - +#ifdef UNIV_HOTBACKUP extern ibool recv_is_making_a_backup; +#endif /* UNIV_HOTBACKUP */ extern ulint recv_max_parsed_page_no; /* Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many @@ -357,12 +366,7 @@ in the debug version: spaces with an odd number as the id are replicate spaces */ #define RECV_REPLICA_SPACE_ADD 1 -/* This many blocks must be left free in the buffer pool when we scan -the log and store the scanned log records in the buffer pool: we will -use these free blocks to read in pages when we start applying the -log records to the database. 
*/ - -#define RECV_POOL_N_FREE_BLOCKS (ut_min(256, buf_pool_get_curr_size() / 8)) +extern ulint recv_n_pool_free_frames; #ifndef UNIV_NONINL #include "log0recv.ic" diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic index 5073404b86a..3ffb9baa344 100644 --- a/innobase/include/mach0data.ic +++ b/innobase/include/mach0data.ic @@ -17,7 +17,7 @@ mach_write_to_1( ulint n) /* in: ulint integer to be stored, >= 0, < 256 */ { ut_ad(b); - ut_ad(n <= 0xFF); + ut_ad(n <= 0xFFUL); b[0] = (byte)n; } @@ -46,7 +46,7 @@ mach_write_to_2( ulint n) /* in: ulint integer to be stored */ { ut_ad(b); - ut_ad(n <= 0xFFFF); + ut_ad(n <= 0xFFFFUL); b[0] = (byte)(n >> 8); b[1] = (byte)(n); @@ -79,7 +79,7 @@ mach_write_to_3( ulint n) /* in: ulint integer to be stored */ { ut_ad(b); - ut_ad(n <= 0xFFFFFF); + ut_ad(n <= 0xFFFFFFUL); b[0] = (byte)(n >> 16); b[1] = (byte)(n >> 8); @@ -184,20 +184,20 @@ mach_write_compressed( { ut_ad(b); - if (n < 0x80) { + if (n < 0x80UL) { mach_write_to_1(b, n); return(1); - } else if (n < 0x4000) { - mach_write_to_2(b, n | 0x8000); + } else if (n < 0x4000UL) { + mach_write_to_2(b, n | 0x8000UL); return(2); - } else if (n < 0x200000) { - mach_write_to_3(b, n | 0xC00000); + } else if (n < 0x200000UL) { + mach_write_to_3(b, n | 0xC00000UL); return(3); - } else if (n < 0x10000000) { - mach_write_to_4(b, n | 0xE0000000); + } else if (n < 0x10000000UL) { + mach_write_to_4(b, n | 0xE0000000UL); return(4); } else { - mach_write_to_1(b, 0xF0); + mach_write_to_1(b, 0xF0UL); mach_write_to_4(b + 1, n); return(5); } @@ -212,13 +212,13 @@ mach_get_compressed_size( /* out: compressed size in bytes */ ulint n) /* in: ulint integer (< 2^32) to be stored */ { - if (n < 0x80) { + if (n < 0x80UL) { return(1); - } else if (n < 0x4000) { + } else if (n < 0x4000UL) { return(2); - } else if (n < 0x200000) { + } else if (n < 0x200000UL) { return(3); - } else if (n < 0x10000000) { + } else if (n < 0x10000000UL) { return(4); } else { return(5); @@ -240,16 +240,16 
@@ mach_read_compressed( flag = mach_read_from_1(b); - if (flag < 0x80) { + if (flag < 0x80UL) { return(flag); - } else if (flag < 0xC0) { - return(mach_read_from_2(b) & 0x7FFF); - } else if (flag < 0xE0) { - return(mach_read_from_3(b) & 0x3FFFFF); - } else if (flag < 0xF0) { - return(mach_read_from_4(b) & 0x1FFFFFFF); + } else if (flag < 0xC0UL) { + return(mach_read_from_2(b) & 0x7FFFUL); + } else if (flag < 0xE0UL) { + return(mach_read_from_3(b) & 0x3FFFFFUL); + } else if (flag < 0xF0UL) { + return(mach_read_from_4(b) & 0x1FFFFFFFUL); } else { - ut_ad(flag == 0xF0); + ut_ad(flag == 0xF0UL); return(mach_read_from_4(b + 1)); } } @@ -439,7 +439,7 @@ mach_dulint_write_much_compressed( return(mach_write_compressed(b, ut_dulint_get_low(n))); } - *b = 0xFF; + *b = (byte)0xFF; size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n)); size += mach_write_compressed(b + size, ut_dulint_get_low(n)); @@ -479,7 +479,7 @@ mach_dulint_read_much_compressed( ut_ad(b); - if (*b != 0xFF) { + if (*b != (byte)0xFF) { high = 0; size = 0; } else { @@ -679,11 +679,10 @@ mach_write_to_2_little_endian( { ut_ad(n < 256 * 256); - *dest = (byte)(n & 0xFF); + *dest = (byte)(n & 0xFFUL); n = n >> 8; dest++; - *dest = (byte)(n & 0xFF); + *dest = (byte)(n & 0xFFUL); } - diff --git a/innobase/include/mem0dbg.ic b/innobase/include/mem0dbg.ic index 6efac719760..7ce5f6f1ba5 100644 --- a/innobase/include/mem0dbg.ic +++ b/innobase/include/mem0dbg.ic @@ -54,7 +54,7 @@ void mem_hash_insert( /*============*/ mem_heap_t* heap, /* in: the created heap */ - char* file_name, /* in: file name of creation */ + const char* file_name, /* in: file name of creation */ ulint line); /* in: line where created */ /******************************************************************* Removes a memory heap (which is going to be freed by the caller) @@ -69,7 +69,7 @@ void mem_hash_remove( /*============*/ mem_heap_t* heap, /* in: the heap to be freed */ - char* file_name, /* in: file name of freeing */ + const char* 
file_name, /* in: file name of freeing */ ulint line); /* in: line where freed */ diff --git a/innobase/include/mem0mem.h b/innobase/include/mem0mem.h index 2dc5a111173..cd01ac77bf3 100644 --- a/innobase/include/mem0mem.h +++ b/innobase/include/mem0mem.h @@ -64,14 +64,14 @@ heap creation. */ #define mem_heap_create(N) mem_heap_create_func(\ (N), NULL, MEM_HEAP_DYNAMIC,\ - IB__FILE__, __LINE__) + __FILE__, __LINE__) /****************************************************************** Use this macro instead of the corresponding function! Macro for memory heap creation. */ #define mem_heap_create_in_buffer(N) mem_heap_create_func(\ (N), NULL, MEM_HEAP_BUFFER,\ - IB__FILE__, __LINE__) + __FILE__, __LINE__) /****************************************************************** Use this macro instead of the corresponding function! Macro for memory heap creation. */ @@ -79,7 +79,7 @@ heap creation. */ #define mem_heap_create_in_btr_search(N) mem_heap_create_func(\ (N), NULL, MEM_HEAP_BTR_SEARCH |\ MEM_HEAP_BUFFER,\ - IB__FILE__, __LINE__) + __FILE__, __LINE__) /****************************************************************** Use this macro instead of the corresponding function! Macro for fast memory heap creation. An initial block of memory B is given by the @@ -88,14 +88,14 @@ mem_heap_free. See the parameter comment in mem_heap_create_func below. */ #define mem_heap_fast_create(N, B) mem_heap_create_func(\ (N), (B), MEM_HEAP_DYNAMIC,\ - IB__FILE__, __LINE__) + __FILE__, __LINE__) /****************************************************************** Use this macro instead of the corresponding function! Macro for memory heap freeing. */ #define mem_heap_free(heap) mem_heap_free_func(\ - (heap), IB__FILE__, __LINE__) + (heap), __FILE__, __LINE__) /********************************************************************* NOTE: Use the corresponding macros instead of this function. Creates a memory heap which allocates memory from dynamic space. 
For debugging @@ -105,26 +105,27 @@ UNIV_INLINE mem_heap_t* mem_heap_create_func( /*=================*/ - /* out, own: memory heap */ - ulint n, /* in: desired start block size, - this means that a single user buffer - of size n will fit in the block, - 0 creates a default size block; - if init_block is not NULL, n tells - its size in bytes */ - void* init_block, /* in: if very fast creation is - wanted, the caller can reserve some - memory from its stack, for example, - and pass it as the the initial block - to the heap: then no OS call of malloc - is needed at the creation. CAUTION: - the caller must make sure the initial - block is not unintentionally erased - (if allocated in the stack), before - the memory heap is explicitly freed. */ - ulint type, /* in: MEM_HEAP_DYNAMIC or MEM_HEAP_BUFFER */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + /* out, own: memory heap */ + ulint n, /* in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block; + if init_block is not NULL, n tells + its size in bytes */ + void* init_block, /* in: if very fast creation is + wanted, the caller can reserve some + memory from its stack, for example, + and pass it as the the initial block + to the heap: then no OS call of malloc + is needed at the creation. CAUTION: + the caller must make sure the initial + block is not unintentionally erased + (if allocated in the stack), before + the memory heap is explicitly freed. */ + ulint type, /* in: MEM_HEAP_DYNAMIC + or MEM_HEAP_BUFFER */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ); /********************************************************************* NOTE: Use the corresponding macro instead of this function. 
Frees the space @@ -135,7 +136,7 @@ void mem_heap_free_func( /*===============*/ mem_heap_t* heap, /* in, own: heap to be freed */ - char* file_name __attribute__((unused)), + const char* file_name __attribute__((unused)), /* in: file name where freed */ ulint line __attribute__((unused))); /* in: line where freed */ @@ -206,13 +207,13 @@ mem_heap_get_size( Use this macro instead of the corresponding function! Macro for memory buffer allocation */ -#define mem_alloc(N) mem_alloc_func((N), IB__FILE__, __LINE__) +#define mem_alloc(N) mem_alloc_func((N), __FILE__, __LINE__) /****************************************************************** Use this macro instead of the corresponding function! Macro for memory buffer allocation */ #define mem_alloc_noninline(N) mem_alloc_func_noninline(\ - (N), IB__FILE__, __LINE__) + (N), __FILE__, __LINE__) /******************************************************************* NOTE: Use the corresponding macro instead of this function. Allocates a single buffer of memory from the dynamic memory of @@ -222,11 +223,11 @@ UNIV_INLINE void* mem_alloc_func( /*===========*/ - /* out, own: free storage, NULL - if did not succeed */ - ulint n, /* in: desired number of bytes */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + /* out, own: free storage, NULL + if did not succeed */ + ulint n, /* in: desired number of bytes */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ); /******************************************************************* NOTE: Use the corresponding macro instead of this function. @@ -237,17 +238,17 @@ with mem_free. 
*/ void* mem_alloc_func_noninline( /*=====================*/ - /* out, own: free storage, NULL if did not - succeed */ - ulint n, /* in: desired number of bytes */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + /* out, own: free storage, + NULL if did not succeed */ + ulint n, /* in: desired number of bytes */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ); /****************************************************************** Use this macro instead of the corresponding function! Macro for memory buffer freeing */ -#define mem_free(PTR) mem_free_func((PTR), IB__FILE__, __LINE__) +#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__) /******************************************************************* NOTE: Use the corresponding macro instead of this function. Frees a single buffer of storage from @@ -256,10 +257,11 @@ UNIV_INLINE void mem_free_func( /*==========*/ - void* ptr, /* in, own: buffer to be freed */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + void* ptr, /* in, own: buffer to be freed */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ); + /************************************************************************** Duplicates a NUL-terminated string. */ UNIV_INLINE diff --git a/innobase/include/mem0mem.ic b/innobase/include/mem0mem.ic index 7ae19d0f31c..d97b7d6c4dd 100644 --- a/innobase/include/mem0mem.ic +++ b/innobase/include/mem0mem.ic @@ -16,18 +16,18 @@ Creates a memory heap block where data can be allocated. 
*/ mem_block_t* mem_heap_create_block( /*==================*/ - /* out, own: memory heap block, NULL if did not - succeed */ - mem_heap_t* heap,/* in: memory heap or NULL if first block should - be created */ - ulint n, /* in: number of bytes needed for user data, or - if init_block is not NULL, its size in bytes */ - void* init_block, /* in: init block in fast create, type must be - MEM_HEAP_DYNAMIC */ - ulint type, /* in: type of heap: MEM_HEAP_DYNAMIC or - MEM_HEAP_BUFFER */ - char* file_name,/* in: file name where created */ - ulint line); /* in: line where created */ + /* out, own: memory heap block, + NULL if did not succeed */ + mem_heap_t* heap, /* in: memory heap or NULL if first block + should be created */ + ulint n, /* in: number of bytes needed for user data, or + if init_block is not NULL, its size in bytes */ + void* init_block, /* in: init block in fast create, + type must be MEM_HEAP_DYNAMIC */ + ulint type, /* in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ + const char* file_name,/* in: file name where created */ + ulint line); /* in: line where created */ /********************************************************************** Frees a block from a memory heap. */ @@ -377,27 +377,27 @@ UNIV_INLINE mem_heap_t* mem_heap_create_func( /*=================*/ - /* out, own: memory heap */ - ulint n, /* in: desired start block size, - this means that a single user buffer - of size n will fit in the block, - 0 creates a default size block; - if init_block is not NULL, n tells - its size in bytes */ - void* init_block, /* in: if very fast creation is - wanted, the caller can reserve some - memory from its stack, for example, - and pass it as the the initial block - to the heap: then no OS call of malloc - is needed at the creation. CAUTION: - the caller must make sure the initial - block is not unintentionally erased - (if allocated in the stack), before - the memory heap is explicitly freed. 
*/ - ulint type, /* in: MEM_HEAP_DYNAMIC, or MEM_HEAP_BUFFER - possibly ORed to MEM_HEAP_BTR_SEARCH */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + /* out, own: memory heap */ + ulint n, /* in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block; + if init_block is not NULL, n tells + its size in bytes */ + void* init_block, /* in: if very fast creation is + wanted, the caller can reserve some + memory from its stack, for example, + and pass it as the the initial block + to the heap: then no OS call of malloc + is needed at the creation. CAUTION: + the caller must make sure the initial + block is not unintentionally erased + (if allocated in the stack), before + the memory heap is explicitly freed. */ + ulint type, /* in: MEM_HEAP_DYNAMIC + or MEM_HEAP_BUFFER */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ) { mem_block_t* block; @@ -440,10 +440,9 @@ void mem_heap_free_func( /*===============*/ mem_heap_t* heap, /* in, own: heap to be freed */ - char* file_name __attribute__((unused)), + const char* file_name __attribute__((unused)), /* in: file name where freed */ ulint line __attribute__((unused))) - /* in: line where freed */ { mem_block_t* block; mem_block_t* prev_block; @@ -486,11 +485,11 @@ UNIV_INLINE void* mem_alloc_func( /*===========*/ - /* out, own: free storage, NULL if did not - succeed */ - ulint n, /* in: desired number of bytes */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + /* out, own: free storage, NULL + if did not succeed */ + ulint n, /* in: desired number of bytes */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ) { mem_heap_t* heap; @@ -523,9 +522,9 @@ UNIV_INLINE void mem_free_func( /*==========*/ - void* ptr, /* in, own: buffer to be freed */ - 
char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + void* ptr, /* in, own: buffer to be freed */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ) { mem_heap_t* heap; diff --git a/innobase/include/mtr0log.h b/innobase/include/mtr0log.h index 785985dea16..41be168a371 100644 --- a/innobase/include/mtr0log.h +++ b/innobase/include/mtr0log.h @@ -57,6 +57,19 @@ mlog_write_initial_log_record( byte type, /* in: log item type: MLOG_1BYTE, ... */ mtr_t* mtr); /* in: mini-transaction handle */ /************************************************************ +Writes a log record about an .ibd file create/delete/rename. */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + /* out: new value of log_ptr */ + ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/* in: space id, if applicable */ + ulint page_no,/* in: page number (not relevant currently) */ + byte* log_ptr,/* in: pointer to mtr log which has been opened */ + mtr_t* mtr); /* in: mtr */ +/************************************************************ Catenates 1 - 4 bytes to the mtr log. 
*/ UNIV_INLINE void diff --git a/innobase/include/mtr0log.ic b/innobase/include/mtr0log.ic index b0392e214f1..aa3f945c202 100644 --- a/innobase/include/mtr0log.ic +++ b/innobase/include/mtr0log.ic @@ -165,13 +165,6 @@ mlog_write_initial_log_record_fast( space = buf_block_get_space(block); offset = buf_block_get_page_no(block); - if (space != 0 || offset > 0x8FFFFFFF) { - fprintf(stderr, - "InnoDB: error: buffer page pointer %p has nonsensical space id %lu\n" - "InnoDB: or page no %lu\n", ptr, space, offset); - ut_error; - } - mach_write_to_1(log_ptr, type); log_ptr++; log_ptr += mach_write_compressed(log_ptr, space); @@ -195,3 +188,31 @@ mlog_write_initial_log_record_fast( #endif return(log_ptr); } + +/************************************************************ +Writes a log record about an .ibd file create/delete/rename. */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + /* out: new value of log_ptr */ + ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/* in: space id, if applicable */ + ulint page_no,/* in: page number (not relevant currently) */ + byte* log_ptr,/* in: pointer to mtr log which has been opened */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(log_ptr); + + mach_write_to_1(log_ptr, type); + log_ptr++; + + /* We write dummy space id and page number */ + log_ptr += mach_write_compressed(log_ptr, space_id); + log_ptr += mach_write_compressed(log_ptr, page_no); + + mtr->n_log_recs++; + + return(log_ptr); +} diff --git a/innobase/include/mtr0mtr.h b/innobase/include/mtr0mtr.h index 5e22ad0c598..e8c68a91dad 100644 --- a/innobase/include/mtr0mtr.h +++ b/innobase/include/mtr0mtr.h @@ -96,7 +96,13 @@ flag value must give the length also! 
*/ sequence of these records */ #define MLOG_DUMMY_RECORD ((byte)32) /* dummy log record used to pad a log block full */ -#define MLOG_BIGGEST_TYPE ((byte)32) /* biggest value (used in +#define MLOG_FILE_CREATE ((byte)33) /* log record about an .ibd + file creation */ +#define MLOG_FILE_RENAME ((byte)34) /* log record about an .ibd + file rename */ +#define MLOG_FILE_DELETE ((byte)35) /* log record about an .ibd + file deletion */ +#define MLOG_BIGGEST_TYPE ((byte)35) /* biggest value (used in asserts) */ /******************************************************************* @@ -192,11 +198,11 @@ mtr_read_dulint( mtr_t* mtr); /* in: mini-transaction handle */ /************************************************************************* This macro locks an rw-lock in s-mode. */ -#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), IB__FILE__, __LINE__,\ +#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\ (MTR)) /************************************************************************* This macro locks an rw-lock in x-mode. */ -#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), IB__FILE__, __LINE__,\ +#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\ (MTR)) /************************************************************************* NOTE! Use the macro above! 
@@ -206,7 +212,7 @@ void mtr_s_lock_func( /*============*/ rw_lock_t* lock, /* in: rw-lock */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line number */ mtr_t* mtr); /* in: mtr */ /************************************************************************* @@ -217,7 +223,7 @@ void mtr_x_lock_func( /*============*/ rw_lock_t* lock, /* in: rw-lock */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line number */ mtr_t* mtr); /* in: mtr */ diff --git a/innobase/include/mtr0mtr.ic b/innobase/include/mtr0mtr.ic index 51112fc0d14..4fc6dd2f6a9 100644 --- a/innobase/include/mtr0mtr.ic +++ b/innobase/include/mtr0mtr.ic @@ -217,7 +217,7 @@ void mtr_s_lock_func( /*============*/ rw_lock_t* lock, /* in: rw-lock */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line number */ mtr_t* mtr) /* in: mtr */ { @@ -236,7 +236,7 @@ void mtr_x_lock_func( /*============*/ rw_lock_t* lock, /* in: rw-lock */ - char* file, /* in: file name */ + const char* file, /* in: file name */ ulint line, /* in: line number */ mtr_t* mtr) /* in: mtr */ { diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index 43741f79855..6549a3748df 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -11,9 +11,11 @@ Created 10/21/1995 Heikki Tuuri #include "univ.i" +#ifndef __WIN__ +#include <dirent.h> +#include <sys/stat.h> +#endif -/* If the following is set to TRUE, we do not call os_file_flush in every -os_file_write */ extern ibool os_do_not_call_flush_at_each_write; extern ibool os_has_said_disk_full; extern ibool os_aio_print_debug; @@ -60,9 +62,12 @@ log. 
*/ #define OS_FILE_OPEN 51 #define OS_FILE_CREATE 52 #define OS_FILE_OVERWRITE 53 +#define OS_FILE_OPEN_RAW 54 +#define OS_FILE_CREATE_PATH 55 #define OS_FILE_READ_ONLY 333 #define OS_FILE_READ_WRITE 444 +#define OS_FILE_READ_ALLOW_DELETE 555 /* for ibbackup */ /* Options for file_create */ #define OS_FILE_AIO 61 @@ -120,6 +125,36 @@ extern ulint os_n_file_reads; extern ulint os_n_file_writes; extern ulint os_n_fsyncs; +/* File types for directory entry data type */ + +enum os_file_type_enum{ + OS_FILE_TYPE_UNKNOWN = 0, + OS_FILE_TYPE_FILE, /* regular file */ + OS_FILE_TYPE_DIR, /* directory */ + OS_FILE_TYPE_LINK /* symbolic link */ +}; +typedef enum os_file_type_enum os_file_type_t; + +/* Maximum path string length in bytes when referring to tables with in the +'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers +of this size from the thread stack; that is why this should not be made much +bigger than 4000 bytes */ +#define OS_FILE_MAX_PATH 4000 + +/* Struct used in fetching information of a file in a directory */ +typedef struct os_file_stat_struct os_file_stat_t; +struct os_file_stat_struct{ + char name[OS_FILE_MAX_PATH]; /* path to a file */ + os_file_type_t type; /* file type */ + ib_longlong size; /* file size */ +}; + +#ifdef __WIN__ +typedef HANDLE os_file_dir_t; /* directory stream */ +#else +typedef DIR* os_file_dir_t; /* directory stream */ +#endif + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -140,58 +175,156 @@ FILE* os_file_create_tmpfile(void); /*========================*/ /* out: temporary file handle (never NULL) */ +/*************************************************************************** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. 
In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. */ + +os_file_dir_t +os_file_opendir( +/*============*/ + /* out: directory stream, NULL if + error */ + const char* dirname, /* in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal);/* in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +/*************************************************************************** +Closes a directory stream. */ + +int +os_file_closedir( +/*=============*/ + /* out: 0 if success, -1 if failure */ + os_file_dir_t dir); /* in: directory stream */ +/*************************************************************************** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. */ + +int +os_file_readdir_next_file( +/*======================*/ + /* out: 0 if ok, -1 if error, 1 if at the end + of the directory */ + const char* dirname,/* in: directory name or path */ + os_file_dir_t dir, /* in: directory stream */ + os_file_stat_t* info); /* in/out: buffer where the info is returned */ +/********************************************************************* +This function attempts to create a directory named pathname. The new directory +gets default permissions. On Unix, the permissions are (0770 & ~umask). If the +directory exists already, nothing is done and the call succeeds, unless the +fail_if_exists arguments is true. */ + +ibool +os_file_create_directory( +/*=====================*/ + /* out: TRUE if call succeeds, + FALSE on error */ + const char* pathname, /* in: directory name as + null-terminated string */ + ibool fail_if_exists);/* in: if TRUE, pre-existing directory + is treated as an error. 
*/ /******************************************************************** A simple function to open or create a file. */ os_file_t os_file_create_simple( /*==================*/ - /* out, own: handle to the file, not defined if error, - error number can be retrieved with os_get_last_error */ - char* name, /* in: name of the file or path as a null-terminated - string */ - ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened - (if does not exist, error), or OS_FILE_CREATE if a new - file is created (if exists, error) */ - ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success);/* out: TRUE if succeed, FALSE if error */ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is + opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), or + OS_FILE_CREATE_PATH if new file + (if exists, error) and subdirectories along + its path are created (if needed)*/ + ulint access_type,/* in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success);/* out: TRUE if succeed, FALSE if error */ /******************************************************************** A simple function to open or create a file. 
*/ os_file_t os_file_create_simple_no_error_handling( /*====================================*/ - /* out, own: handle to the file, not defined if error, - error number can be retrieved with os_get_last_error */ - char* name, /* in: name of the file or path as a null-terminated - string */ - ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened - (if does not exist, error), or OS_FILE_CREATE if a new - file is created (if exists, error) */ - ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success);/* out: TRUE if succeed, FALSE if error */ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success);/* out: TRUE if succeed, FALSE if error */ /******************************************************************** Opens an existing file or creates a new. */ os_file_t os_file_create( /*===========*/ - /* out, own: handle to the file, not defined if error, - error number can be retrieved with os_get_last_error */ - char* name, /* in: name of the file or path as a null-terminated - string */ - ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened - (if does not exist, error), or OS_FILE_CREATE if a new - file is created (if exists, error), OS_FILE_OVERWRITE - if a new file is created or an old overwritten */ - ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o - is desired, OS_FILE_NORMAL, if any normal file; - NOTE that it also depends on type, os_aio_.. and srv_.. 
- variables whether we really use async i/o or - unbuffered i/o: look in the function source code for - the exact rules */ - ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success);/* out: TRUE if succeed, FALSE if error */ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), + OS_FILE_OVERWRITE if a new file is created + or an old overwritten; + OS_FILE_OPEN_RAW, if a raw device or disk + partition should be opened */ + ulint purpose,/* in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success);/* out: TRUE if succeed, FALSE if error */ +/*************************************************************************** +Deletes a file. The file has to be closed before calling this. */ + +ibool +os_file_delete( +/*===========*/ + /* out: TRUE if success */ + const char* name); /* in: file path as a null-terminated string */ + +/*************************************************************************** +Deletes a file if it exists. The file has to be closed before calling this. */ + +ibool +os_file_delete_if_exists( +/*=====================*/ + /* out: TRUE if success */ + const char* name); /* in: file path as a null-terminated string */ +/*************************************************************************** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. 
*/ + +ibool +os_file_rename( +/*===========*/ + /* out: TRUE if success */ + const char* oldpath, /* in: old file path as a + null-terminated string */ + const char* newpath); /* in: new file path */ /*************************************************************************** Closes a file handle. In case of error, error number can be retrieved with os_file_get_last_error. */ @@ -221,13 +354,21 @@ os_file_get_size( size */ ulint* size_high);/* out: most significant 32 bits of size */ /*************************************************************************** +Gets file size as a 64-bit integer ib_longlong. */ + +ib_longlong +os_file_get_size_as_iblonglong( +/*===========================*/ + /* out: size in bytes, -1 if error */ + os_file_t file); /* in: handle to a file */ +/*************************************************************************** Sets a file size. This function can be used to extend or truncate a file. */ ibool os_file_set_size( /*=============*/ /* out: TRUE if success */ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ os_file_t file, /* in: handle to a file */ ulint size, /* in: least significant 32 bits of file @@ -256,9 +397,12 @@ overwrite the error number). If the number is not known to this program, the OS error number + 100 is returned. */ ulint -os_file_get_last_error(void); -/*========================*/ - /* out: error number, or OS error number + 100 */ +os_file_get_last_error( +/*===================*/ + /* out: error number, or OS error + number + 100 */ + ibool report_all_errors); /* in: TRUE if we want an error message + printed of all errors */ /*********************************************************************** Requests a synchronous read operation. 
*/ @@ -275,6 +419,23 @@ os_file_read( offset */ ulint n); /* in: number of bytes to read */ /*********************************************************************** +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. */ + +ibool +os_file_read_no_error_handling( +/*===========================*/ + /* out: TRUE if request was + successful, FALSE if fail */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint offset, /* in: least significant 32 bits of file + offset where to read */ + ulint offset_high,/* in: most significant 32 bits of + offset */ + ulint n); /* in: number of bytes to read */ + +/*********************************************************************** Requests a synchronous write operation. */ ibool @@ -282,15 +443,68 @@ os_file_write( /*==========*/ /* out: TRUE if request was successful, FALSE if fail */ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ os_file_t file, /* in: handle to a file */ - void* buf, /* in: buffer from which to write */ + const void* buf, /* in: buffer from which to write */ ulint offset, /* in: least significant 32 bits of file offset where to write */ ulint offset_high,/* in: most significant 32 bits of offset */ ulint n); /* in: number of bytes to write */ +/*********************************************************************** +Check the existence and type of the given file. */ + +ibool +os_file_status( +/*===========*/ + /* out: TRUE if call succeeded */ + const char* path, /* in: pathname of the file */ + ibool* exists, /* out: TRUE if file exists */ + os_file_type_t* type); /* out: type of the file (if it exists) */ +/******************************************************************** +The function os_file_dirname returns a directory component of a +null-terminated pathname string. 
In the usual case, dirname returns +the string up to, but not including, the final '/', and basename +is the component following the final '/'. Trailing '/' charac +ters are not counted as part of the pathname. + +If path does not contain a slash, dirname returns the string ".". + +Concatenating the string returned by dirname, a "/", and the basename +yields a complete pathname. + +The return value is a copy of the directory component of the pathname. +The copy is allocated from heap. It is the caller responsibility +to free it after it is no longer needed. + +The following list of examples (taken from SUSv2) shows the strings +returned by dirname and basename for different paths: + + path dirname basename + "/usr/lib" "/usr" "lib" + "/usr/" "/" "usr" + "usr" "." "usr" + "/" "/" "/" + "." "." "." + ".." "." ".." +*/ + +char* +os_file_dirname( +/*============*/ + /* out, own: directory component of the + pathname */ + const char* path); /* in: pathname */ +/******************************************************************** +Creates all missing subdirectories along the given path. */ + +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + /* out: TRUE if call succeeded + FALSE otherwise */ + const char* path); /* in: path name */ /**************************************************************************** Initializes the asynchronous io system. Creates separate aio array for non-ibuf read and write, a third aio array for the ibuf i/o, with just one @@ -330,7 +544,7 @@ os_aio( because i/os are not actually handled until all have been posted: use with great caution! 
*/ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ os_file_t file, /* in: handle to a file */ void* buf, /* in: buffer where to read or from which diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index 7618032a11f..d0d3cf82e38 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -15,6 +15,76 @@ Created 9/30/1995 Heikki Tuuri typedef void* os_process_t; typedef unsigned long int os_process_id_t; +/* The cell type in os_awe_allocate_mem page info */ +#if defined(__WIN2000__) && defined(ULONG_PTR) +typedef ULONG_PTR os_awe_t; +#else +typedef ulint os_awe_t; +#endif + +/* Physical page size when Windows AWE is used. This is the normal +page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB +pages. */ +#define OS_AWE_X86_PAGE_SIZE 4096 + +/******************************************************************** +Windows AWE support. Tries to enable the "lock pages in memory" privilege for +the current process so that the current process can allocate memory-locked +virtual address space to act as the window where AWE maps physical memory. */ + +ibool +os_awe_enable_lock_pages_in_mem(void); +/*=================================*/ + /* out: TRUE if success, FALSE if error; + prints error info to stderr if no success */ +/******************************************************************** +Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86 +processor. 
*/ + +ibool +os_awe_allocate_physical_mem( +/*=========================*/ + /* out: TRUE if success */ + os_awe_t** page_info, /* out, own: array of opaque data containing + the info for allocated physical memory pages; + each allocated 4 kB physical memory page has + one slot of type os_awe_t in the array */ + ulint n_megabytes); /* in: number of megabytes to allocate */ +/******************************************************************** +Allocates a window in the virtual address space where we can map then +pages of physical memory. */ + +byte* +os_awe_allocate_virtual_mem_window( +/*===============================*/ + /* out, own: allocated memory, or NULL if did not + succeed */ + ulint size); /* in: virtual memory allocation size in bytes, must + be < 2 GB */ +/******************************************************************** +With this function you can map parts of physical memory allocated with +the ..._allocate_physical_mem to the virtual address space allocated with +the previous function. Intel implements this so that the process page +tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP +showed that this takes < 1 microsecond, much better than the estimated 80 us +for copying a 16 kB page memory to memory. But, the operation will at least +partially invalidate the translation lookaside buffer (TLB) of all +processors. Under a real-world load the performance hit may be bigger. 
*/ + +ibool +os_awe_map_physical_mem_to_window( +/*==============================*/ + /* out: TRUE if success; the function + calls exit(1) in case of an error */ + byte* ptr, /* in: a page-aligned pointer to + somewhere in the virtual address + space window; we map the physical mem + pages here */ + ulint n_mem_pages, /* in: number of 4 kB mem pages to + map */ + os_awe_t* page_info); /* in: array of page infos for those + pages; each page has one slot in the + array */ /******************************************************************** Converts the current process id to a number. It is not guaranteed that the number is unique. In Linux returns the 'process number' of the current diff --git a/innobase/include/os0sync.h b/innobase/include/os0sync.h index e1cf263216e..d27b1676f1b 100644 --- a/innobase/include/os0sync.h +++ b/innobase/include/os0sync.h @@ -87,9 +87,9 @@ explicitly by calling sync_os_reset_event. */ os_event_t os_event_create( /*============*/ - /* out: the event handle */ - char* name); /* in: the name of the event, if NULL - the event is created without a name */ + /* out: the event handle */ + const char* name); /* in: the name of the event, if NULL + the event is created without a name */ #ifdef __WIN__ /************************************************************* Creates an auto-reset event semaphore, i.e., an event which is automatically @@ -98,9 +98,9 @@ reset when a single thread is released. Works only in Windows. 
*/ os_event_t os_event_create_auto( /*=================*/ - /* out: the event handle */ - char* name); /* in: the name of the event, if NULL - the event is created without a name */ + /* out: the event handle */ + const char* name); /* in: the name of the event, if NULL + the event is created without a name */ #endif /************************************************************** Sets an event semaphore to the signaled state: lets waiting threads @@ -171,9 +171,9 @@ mutex semaphore of InnoDB itself (mutex_t) should be used where possible. */ os_mutex_t os_mutex_create( /*============*/ - /* out: the mutex handle */ - char* name); /* in: the name of the mutex, if NULL - the mutex is created without a name */ + /* out: the mutex handle */ + const char* name); /* in: the name of the mutex, if NULL + the mutex is created without a name */ /************************************************************** Acquires ownership of a mutex semaphore. */ diff --git a/innobase/include/os0thread.h b/innobase/include/os0thread.h index 6603229e524..c00d28baf60 100644 --- a/innobase/include/os0thread.h +++ b/innobase/include/os0thread.h @@ -78,6 +78,10 @@ os_thread_create( function */ os_thread_id_t* thread_id); /* out: id of the created thread */ +int +os_thread_join( +/*=============*/ + os_thread_id_t thread_id); /* in: id of the thread to join */ /********************************************************************* Exits the current thread. 
*/ diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 04f771c3abd..969313614e3 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -596,7 +596,8 @@ byte* page_parse_delete_rec_list( /*=======================*/ /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or MLOG_LIST_START_DELETE */ + byte type, /* in: MLOG_LIST_END_DELETE or + MLOG_LIST_START_DELETE */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ page_t* page, /* in: page or NULL */ diff --git a/innobase/include/pars0pars.h b/innobase/include/pars0pars.h index 2e86a7e5534..28985e2f9d0 100644 --- a/innobase/include/pars0pars.h +++ b/innobase/include/pars0pars.h @@ -74,8 +74,8 @@ Parses an SQL string returning the query graph. */ que_t* pars_sql( /*=====*/ - /* out, own: the query graph */ - char* str); /* in: SQL string */ + /* out, own: the query graph */ + const char* str); /* in: SQL string */ /***************************************************************** Retrieves characters to the lexical analyzer. */ @@ -92,7 +92,7 @@ Called by yyparse on error. */ void yyerror( /*====*/ - char* s); /* in: error message string */ + const char* s); /* in: error message string */ /************************************************************************* Parses a variable declaration. */ diff --git a/innobase/include/pars0sym.h b/innobase/include/pars0sym.h index 3060fd06c8f..a40523861dd 100644 --- a/innobase/include/pars0sym.h +++ b/innobase/include/pars0sym.h @@ -122,7 +122,7 @@ struct sym_node_struct{ SYM_IMPLICIT_VAR, SYM_LIT, SYM_TABLE, SYM_CURSOR, ... 
*/ - char* name; /* name of an id */ + const char* name; /* name of an id */ ulint name_len; /* id name length */ dict_table_t* table; /* table definition if a table id or a @@ -150,7 +150,7 @@ struct sym_tab_struct{ que_t* query_graph; /* query graph generated by the parser */ - char* sql_string; + const char* sql_string; /* SQL string to parse */ int string_len; /* SQL string length */ diff --git a/innobase/include/que0types.h b/innobase/include/que0types.h index c7ce09db40b..e59c2313a5a 100644 --- a/innobase/include/que0types.h +++ b/innobase/include/que0types.h @@ -36,7 +36,8 @@ struct que_common_struct{ if the buffer has been allocated dynamically: if this field is != 0, and the node is a symbol node or a function node, then we - have to free the data field in val explicitly */ + have to free the data field in val + explicitly */ }; #endif diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index ff9d1495198..86bf263170f 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -21,7 +21,7 @@ Created 5/30/1994 Heikki Tuuri /* Flag denoting the predefined minimum record: this bit is ORed in the 4 info bits of a record */ -#define REC_INFO_MIN_REC_FLAG 0x10 +#define REC_INFO_MIN_REC_FLAG 0x10UL /* Number of extra bytes in a record, in addition to the data and the offsets */ @@ -397,8 +397,8 @@ rec_print( /* Maximum lengths for the data in a physical record if the offsets are given in one byte (resp. two byte) format. 
*/ -#define REC_1BYTE_OFFS_LIMIT 0x7F -#define REC_2BYTE_OFFS_LIMIT 0x7FFF +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL /* The data size of record must be smaller than this because we reserve two upmost bits in a two byte offset for special purposes */ diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index 75a8bdfd6bd..c36bf8f6d6e 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -29,41 +29,41 @@ significant bytes and bits are written below less significant. and the shift needed to obtain each bit-field of the record. */ #define REC_NEXT 2 -#define REC_NEXT_MASK 0xFFFF +#define REC_NEXT_MASK 0xFFFFUL #define REC_NEXT_SHIFT 0 #define REC_SHORT 3 /* This is single byte bit-field */ -#define REC_SHORT_MASK 0x1 +#define REC_SHORT_MASK 0x1UL #define REC_SHORT_SHIFT 0 #define REC_N_FIELDS 4 -#define REC_N_FIELDS_MASK 0x7FE +#define REC_N_FIELDS_MASK 0x7FEUL #define REC_N_FIELDS_SHIFT 1 #define REC_HEAP_NO 5 -#define REC_HEAP_NO_MASK 0xFFF8 +#define REC_HEAP_NO_MASK 0xFFF8UL #define REC_HEAP_NO_SHIFT 3 #define REC_N_OWNED 6 /* This is single byte bit-field */ -#define REC_N_OWNED_MASK 0xF +#define REC_N_OWNED_MASK 0xFUL #define REC_N_OWNED_SHIFT 0 -#define REC_INFO_BITS_MASK 0xF0 +#define REC_INFO_BITS_MASK 0xF0UL #define REC_INFO_BITS_SHIFT 0 /* The deleted flag in info bits */ -#define REC_INFO_DELETED_FLAG 0x20 /* when bit is set to 1, it means the +#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the record has been delete marked */ /* The following masks are used to filter the SQL null bit from one-byte and two-byte offsets */ -#define REC_1BYTE_SQL_NULL_MASK 0x80 -#define REC_2BYTE_SQL_NULL_MASK 0x8000 +#define REC_1BYTE_SQL_NULL_MASK 0x80UL +#define REC_2BYTE_SQL_NULL_MASK 0x8000UL /* In a 2-byte offset the second most significant bit denotes a field stored to another page: */ -#define REC_2BYTE_EXTERN_MASK 0x4000 +#define REC_2BYTE_EXTERN_MASK 0x4000UL 
/**************************************************************** Return field length or UNIV_SQL_NULL. */ @@ -133,7 +133,7 @@ rec_set_bit_field_1( ut_ad(rec); ut_ad(offs <= REC_N_EXTRA_BYTES); ut_ad(mask); - ut_ad(mask <= 0xFF); + ut_ad(mask <= 0xFFUL); ut_ad(((mask >> shift) << shift) == mask); ut_ad(((val << shift) & mask) == (val << shift)); @@ -172,8 +172,8 @@ rec_set_bit_field_2( { ut_ad(rec); ut_ad(offs <= REC_N_EXTRA_BYTES); - ut_ad(mask > 0xFF); - ut_ad(mask <= 0xFFFF); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); ut_ad((mask >> shift) & 1); ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); ut_ad(((mask >> shift) << shift) == mask); @@ -188,8 +188,8 @@ rec_set_bit_field_2( + (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4))) + (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4))) + (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3))) - + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); - if (m != ut_dbg_zero + 0xFFFFFFFF) { + + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); + if (m != ut_dbg_zero + 0xFFFFFFFFUL) { fprintf(stderr, "Sum of masks %lx\n", m); ut_error; } diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 0ab70db2dea..8f6264944ce 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -52,6 +52,14 @@ row_mysql_read_var_ref_noninline( ulint* len, /* out: variable-length field length */ byte* field); /* in: field */ /*********************************************************************** +Frees the blob heap in prebuilt when no longer needed. */ + +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt); /* in: prebuilt struct of a + ha_innobase:: table handle */ +/*********************************************************************** Stores a reference to a BLOB in the MySQL format. */ void @@ -309,15 +317,16 @@ fields than mentioned in the constraint. 
*/ int row_table_add_foreign_constraints( /*==============================*/ - /* out: error code or DB_SUCCESS */ - trx_t* trx, /* in: transaction */ - char* sql_string, /* in: table create statement where - foreign keys are declared like: + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: FOREIGN KEY (a, b) REFERENCES table2(c, d), - table2 can be written also with the database - name before it: test.table2 */ - char* name); /* in: table full name in the normalized form - database_name/table_name */ + table2 can be written also with the + database name before it: test.table2 */ + const char* name); /* in: table full name in the + normalized form + database_name/table_name */ /************************************************************************* The master thread in srv0srv.c calls this regularly to drop tables which we must drop in background after queries to them have ended. Such lazy @@ -344,29 +353,69 @@ output by the master thread. */ int row_drop_table_for_mysql( /*=====================*/ - /* out: error code or DB_SUCCESS */ - char* name, /* in: table name */ - trx_t* trx, /* in: transaction handle */ - ibool drop_db);/* in: TRUE=dropping whole database */ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx, /* in: transaction handle */ + ibool drop_db);/* in: TRUE=dropping whole database */ + +/************************************************************************* +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set TRUE. + +How do we prevent crashes caused by ongoing operations on the table? Old +operations could try to access non-existent pages. 
+ +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do DISCARD TABLESPACE. Then there are no running +queries on the table. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: we remove all entries for the tablespace in the insert +buffer tree; as long as the tablespace mem object does not exist, ongoing +insert buffer page merges are discarded in buf0rea.c. If we recreate the +tablespace mem object with IMPORT TABLESPACE later, then the tablespace will +have the same id, but the tablespace_version field in the mem object is +different, and ongoing old insert buffer page merges get discarded. +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations. */ + +int +row_discard_tablespace_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx); /* in: transaction handle */ +/********************************************************************* +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. */ + +int +row_import_tablespace_for_mysql( +/*============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx); /* in: transaction handle */ /************************************************************************* Drops a database for MySQL. 
*/ int row_drop_database_for_mysql( /*========================*/ - /* out: error code or DB_SUCCESS */ - char* name, /* in: database name which ends to '/' */ - trx_t* trx); /* in: transaction handle */ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: database name which ends to '/' */ + trx_t* trx); /* in: transaction handle */ /************************************************************************* Renames a table for MySQL. */ int row_rename_table_for_mysql( /*=======================*/ - /* out: error code or DB_SUCCESS */ - char* old_name, /* in: old table name */ - char* new_name, /* in: new table name */ - trx_t* trx); /* in: transaction handle */ + /* out: error code or DB_SUCCESS */ + const char* old_name, /* in: old table name */ + const char* new_name, /* in: new table name */ + trx_t* trx); /* in: transaction handle */ /************************************************************************* Checks a table for corruption. */ @@ -481,13 +530,15 @@ struct row_prebuilt_struct { byte* ins_upd_rec_buff;/* buffer for storing data converted to the Innobase format from the MySQL format */ - ibool hint_no_need_to_fetch_extra_cols; - /* normally this is TRUE, but - MySQL will set this to FALSE - if we might be required to fetch also - other columns than mentioned in the - query: the clustered index column(s), - or an auto-increment column*/ + ulint hint_need_to_fetch_extra_cols; + /* normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ upd_node_t* upd_node; /* Innobase SQL update node used to perform updates and deletes */ que_fork_t* ins_graph; /* Innobase SQL query graph used @@ -546,6 +597,11 @@ struct row_prebuilt_struct { #define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in row_scan_and_check_index 
*/ +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + + #ifndef UNIV_NONINL #include "row0mysql.ic" #endif diff --git a/innobase/include/row0sel.h b/innobase/include/row0sel.h index 5ef7ff9399a..0be224eb255 100644 --- a/innobase/include/row0sel.h +++ b/innobase/include/row0sel.h @@ -118,7 +118,8 @@ row_search_for_mysql( /*=================*/ /* out: DB_SUCCESS, DB_RECORD_NOT_FOUND, - DB_END_OF_INDEX, or DB_DEADLOCK */ + DB_END_OF_INDEX, DB_DEADLOCK, + or DB_TOO_BIG_RECORD */ byte* buf, /* in/out: buffer for the fetched row in the MySQL format */ ulint mode, /* in: search mode PAGE_CUR_L, ... */ @@ -143,11 +144,11 @@ consistent read result, or store it to the query cache. */ ibool row_search_check_if_query_cache_permitted( /*======================================*/ - /* out: TRUE if storing or retrieving from - the query cache is permitted */ - trx_t* trx, /* in: transaction object */ - char* norm_name); /* in: concatenation of database name, '/' - char, table name */ + /* out: TRUE if storing or retrieving + from the query cache is permitted */ + trx_t* trx, /* in: transaction object */ + const char* norm_name); /* in: concatenation of database name, + '/' char, table name */ /* A structure for caching column values for prefetched rows */ diff --git a/innobase/include/row0sel.ic b/innobase/include/row0sel.ic index 509838a3327..595cea1138b 100644 --- a/innobase/include/row0sel.ic +++ b/innobase/include/row0sel.ic @@ -77,7 +77,7 @@ open_step( if (err != DB_SUCCESS) { /* SQL error detected */ - fprintf(stderr, "SQL error %lu\n", err); + fprintf(stderr, "SQL error %lu\n", (ulong) err); ut_error; } diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic index 65667f1f00d..d89938d696a 100644 --- a/innobase/include/row0upd.ic +++ b/innobase/include/row0upd.ic @@ -85,11 +85,11 @@ upd_field_set_field_no( if (field_no >= dict_index_get_n_fields(index)) { fprintf(stderr, "InnoDB: Error: trying 
to access field %lu in ", - field_no); + (ulong) field_no); dict_index_name_print(stderr, index); fprintf(stderr, "\n" "InnoDB: but index only has %lu fields\n", - dict_index_get_n_fields(index)); + (ulong) dict_index_get_n_fields(index)); } dtype_copy(dfield_get_type(&(upd_field->new_val)), diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 0be13528fd7..c7ba39aaaf1 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -16,10 +16,7 @@ Created 10/10/1995 Heikki Tuuri #include "que0types.h" #include "trx0types.h" -extern char* srv_main_thread_op_info; - -/* Buffer which can be used in printing fatal error messages */ -extern char srv_fatal_errbuf[]; +extern const char* srv_main_thread_op_info; /* When this event is set the lock timeout and InnoDB monitor thread starts running */ @@ -40,7 +37,11 @@ extern FILE* srv_monitor_file; /* Server parameters which are read from the initfile */ extern char* srv_data_home; +#ifdef UNIV_LOG_ARCHIVE extern char* srv_arch_dir; +#endif /* UNIV_LOG_ARCHIVE */ + +extern ibool srv_file_per_table; extern ulint srv_n_data_files; extern char** srv_data_file_names; @@ -60,20 +61,23 @@ extern char** srv_log_group_home_dirs; extern ulint srv_n_log_groups; extern ulint srv_n_log_files; extern ulint srv_log_file_size; -extern ibool srv_log_archive_on; extern ulint srv_log_buffer_size; extern ulint srv_flush_log_at_trx_commit; extern byte srv_latin1_ordering[256];/* The sort order table of the latin1 character set */ extern ulint srv_pool_size; +extern ulint srv_awe_window_size; extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; extern ulint srv_n_file_io_threads; +#ifdef UNIV_LOG_ARCHIVE +extern ibool srv_log_archive_on; extern ibool srv_archive_recovery; extern dulint srv_archive_recovery_limit_lsn; +#endif /* UNIV_LOG_ARCHIVE */ extern ulint srv_lock_wait_timeout; @@ -81,12 +85,14 @@ extern char* srv_file_flush_method_str; extern ulint srv_unix_file_flush_method; extern ulint 
srv_win_file_flush_method; +extern ulint srv_max_n_open_files; + extern ulint srv_max_dirty_pages_pct; extern ulint srv_force_recovery; extern ulint srv_thread_concurrency; -extern ulint srv_max_n_threads; +extern ulint srv_max_n_threads; extern lint srv_conc_n_threads; @@ -97,6 +103,8 @@ extern ibool srv_use_doublewrite_buf; extern ibool srv_set_thread_priorities; extern int srv_query_thread_priority; +extern ibool srv_use_awe; +extern ibool srv_use_adaptive_hash_indexes; /*-------------------------------------------*/ extern ulint srv_n_rows_inserted; diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h index c4c8dac5d7a..0074de537c3 100644 --- a/innobase/include/srv0start.h +++ b/innobase/include/srv0start.h @@ -11,6 +11,7 @@ Created 10/10/1995 Heikki Tuuri #define srv0start_h #include "univ.i" +#include "ut0byte.h" /************************************************************************* Normalizes a directory path for Windows: converts slashes to backslashes. */ @@ -69,12 +70,17 @@ innobase_shutdown_for_mysql(void); /*=============================*/ /* out: DB_SUCCESS or error code */ +extern dulint srv_shutdown_lsn; +extern dulint srv_start_lsn; + extern ulint srv_sizeof_trx_t_in_ha_innodb_cc; extern ibool srv_is_being_started; extern ibool srv_startup_is_before_trx_rollback_phase; extern ibool srv_is_being_shut_down; +extern ibool srv_start_raw_disk_in_use; + /* At a shutdown the value first climbs from 0 to SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ @@ -84,4 +90,7 @@ extern ulint srv_shutdown_state; #define SRV_SHUTDOWN_LAST_PHASE 2 #define SRV_SHUTDOWN_EXIT_THREADS 3 +/* Log 'spaces' have id's >= this */ +#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL + #endif diff --git a/innobase/include/sync0arr.h b/innobase/include/sync0arr.h index 4324f2d3f2c..92691d5fdd9 100644 --- a/innobase/include/sync0arr.h +++ b/innobase/include/sync0arr.h @@ -51,7 +51,7 @@ sync_array_reserve_cell( sync_array_t* arr, /* in: wait 
array */ void* object, /* in: pointer to the object to wait for */ ulint type, /* in: lock request type */ - char* file, /* in: file where requested */ + const char* file, /* in: file where requested */ ulint line, /* in: line where requested */ ulint* index); /* out: index of the reserved cell */ /********************************************************************** diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h index d71691b4353..9a988a03e92 100644 --- a/innobase/include/sync0rw.h +++ b/innobase/include/sync0rw.h @@ -62,7 +62,7 @@ location (which must be appropriately aligned). The rw-lock is initialized to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free is necessary only if the memory block containing it is freed. */ -#define rw_lock_create(L) rw_lock_create_func((L), IB__FILE__, __LINE__) +#define rw_lock_create(L) rw_lock_create_func((L), __FILE__, __LINE__) /*=====================*/ /********************************************************************** Creates, or rather, initializes an rw-lock object in a specified memory @@ -74,7 +74,7 @@ void rw_lock_create_func( /*================*/ rw_lock_t* lock, /* in: pointer to memory */ - char* cfile_name, /* in: file name where created */ + const char* cfile_name, /* in: file name where created */ ulint cline); /* in: file line where created */ /********************************************************************** Calling this function is obligatory only if the memory buffer containing @@ -98,19 +98,19 @@ NOTE! The following macros should be used in rw s-locking, not the corresponding function. */ #define rw_lock_s_lock(M) rw_lock_s_lock_func(\ - (M), 0, IB__FILE__, __LINE__) + (M), 0, __FILE__, __LINE__) /****************************************************************** NOTE! The following macros should be used in rw s-locking, not the corresponding function. 
*/ #define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(\ - (M), (P), IB__FILE__, __LINE__) + (M), (P), __FILE__, __LINE__) /****************************************************************** NOTE! The following macros should be used in rw s-locking, not the corresponding function. */ #define rw_lock_s_lock_nowait(M) rw_lock_s_lock_func_nowait(\ - (M), IB__FILE__, __LINE__) + (M), __FILE__, __LINE__) /********************************************************************** NOTE! Use the corresponding macro, not directly this function, except if you supply the file name and line number. Lock an rw-lock in shared mode @@ -125,7 +125,7 @@ rw_lock_s_lock_func( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line); /* in: line where requested */ /********************************************************************** NOTE! Use the corresponding macro, not directly this function, except if @@ -137,7 +137,7 @@ rw_lock_s_lock_func_nowait( /*=======================*/ /* out: TRUE if success */ rw_lock_t* lock, /* in: pointer to rw-lock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line); /* in: line where requested */ /********************************************************************** NOTE! Use the corresponding macro, not directly this function! Lock an @@ -149,7 +149,7 @@ rw_lock_x_lock_func_nowait( /*=======================*/ /* out: TRUE if success */ rw_lock_t* lock, /* in: pointer to rw-lock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line); /* in: line where requested */ /********************************************************************** Releases a shared mode lock. 
*/ @@ -184,19 +184,19 @@ NOTE! The following macro should be used in rw x-locking, not the corresponding function. */ #define rw_lock_x_lock(M) rw_lock_x_lock_func(\ - (M), 0, IB__FILE__, __LINE__) + (M), 0, __FILE__, __LINE__) /****************************************************************** NOTE! The following macro should be used in rw x-locking, not the corresponding function. */ #define rw_lock_x_lock_gen(M, P) rw_lock_x_lock_func(\ - (M), (P), IB__FILE__, __LINE__) + (M), (P), __FILE__, __LINE__) /****************************************************************** NOTE! The following macros should be used in rw x-locking, not the corresponding function. */ #define rw_lock_x_lock_nowait(M) rw_lock_x_lock_func_nowait(\ - (M), IB__FILE__, __LINE__) + (M), __FILE__, __LINE__) /********************************************************************** NOTE! Use the corresponding macro, not directly this function! Lock an rw-lock in exclusive mode for the current thread. If the rw-lock is locked @@ -213,7 +213,7 @@ rw_lock_x_lock_func( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line); /* in: line where requested */ /********************************************************************** Releases an exclusive mode lock. 
*/ @@ -251,9 +251,9 @@ UNIV_INLINE void rw_lock_s_lock_direct( /*==================*/ - rw_lock_t* lock /* in: pointer to rw-lock */ - ,char* file_name, /* in: file name where lock requested */ - ulint line /* in: line where requested */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name, /* in: file name where requested */ + ulint line /* in: line where lock requested */ ); /********************************************************************** Low-level function which locks an rw-lock in x-mode when we know that it @@ -263,9 +263,9 @@ UNIV_INLINE void rw_lock_x_lock_direct( /*==================*/ - rw_lock_t* lock /* in: pointer to rw-lock */ - ,char* file_name, /* in: file name where lock requested */ - ulint line /* in: line where requested */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name, /* in: file name where requested */ + ulint line /* in: line where lock requested */ ); /********************************************************************** This function is used in the insert buffer to move the ownership of an @@ -449,10 +449,10 @@ struct rw_lock_struct { #endif /* UNIV_SYNC_DEBUG */ ulint level; /* Level in the global latching order; default SYNC_LEVEL_NONE */ - char* cfile_name; /* File name where lock created */ + const char* cfile_name;/* File name where lock created */ ulint cline; /* Line where created */ - char* last_s_file_name;/* File name where last time s-locked */ - char* last_x_file_name;/* File name where last time x-locked */ + const char* last_s_file_name;/* File name where last s-locked */ + const char* last_x_file_name;/* File name where last x-locked */ ulint last_s_line; /* Line number where last time s-locked */ ulint last_x_line; /* Line number where last time x-locked */ ulint magic_n; @@ -469,7 +469,7 @@ struct rw_lock_debug_struct { ulint pass; /* Pass value given in the lock operation */ ulint lock_type; /* Type of the lock: RW_LOCK_EX, RW_LOCK_SHARED, RW_LOCK_WAIT_EX */ - char* 
file_name; /* File name where the lock was obtained */ + const char* file_name;/* File name where the lock was obtained */ ulint line; /* Line where the rw-lock was locked */ UT_LIST_NODE_T(rw_lock_debug_t) list; /* Debug structs are linked in a two-way diff --git a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic index 8fc93f4a9da..3a92100ba01 100644 --- a/innobase/include/sync0rw.ic +++ b/innobase/include/sync0rw.ic @@ -18,7 +18,7 @@ rw_lock_s_lock_spin( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line); /* in: line where requested */ #ifdef UNIV_SYNC_DEBUG /********************************************************************** @@ -30,7 +30,7 @@ rw_lock_add_debug_info( rw_lock_t* lock, /* in: rw-lock */ ulint pass, /* in: pass value */ ulint lock_type, /* in: lock type */ - char* file_name, /* in: file where requested */ + const char* file_name, /* in: file where requested */ ulint line); /* in: line where requested */ /********************************************************************** Removes a debug information struct for an rw-lock. 
*/ @@ -130,7 +130,7 @@ rw_lock_s_lock_low( ulint pass __attribute__((unused)), /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name, /* in: file name where lock requested */ + const char* file_name, /* in: file name where lock requested */ ulint line) /* in: line where requested */ { #ifdef UNIV_SYNC_DEBUG @@ -163,9 +163,9 @@ UNIV_INLINE void rw_lock_s_lock_direct( /*==================*/ - rw_lock_t* lock, /* in: pointer to rw-lock */ - char* file_name,/* in: file name where lock requested */ - ulint line) /* in: line where requested */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ { ut_ad(lock->writer == RW_LOCK_NOT_LOCKED); ut_ad(rw_lock_get_reader_count(lock) == 0); @@ -189,9 +189,9 @@ UNIV_INLINE void rw_lock_x_lock_direct( /*==================*/ - rw_lock_t* lock, /* in: pointer to rw-lock */ - char* file_name, /* in: file name where lock requested */ - ulint line) /* in: line where requested */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ { ut_ad(rw_lock_validate(lock)); ut_ad(rw_lock_get_reader_count(lock) == 0); @@ -223,7 +223,7 @@ rw_lock_s_lock_func( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name, /* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line) /* in: line where requested */ { /* NOTE: As we do not know the thread ids for threads which have @@ -267,7 +267,7 @@ rw_lock_s_lock_func_nowait( /*=======================*/ /* out: TRUE if success */ rw_lock_t* lock, /* in: pointer to rw-lock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested 
*/ ulint line) /* in: line where requested */ { ibool success = FALSE; @@ -304,7 +304,7 @@ rw_lock_x_lock_func_nowait( /*=======================*/ /* out: TRUE if success */ rw_lock_t* lock, /* in: pointer to rw-lock */ - char* file_name, /* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line) /* in: line where requested */ { ibool success = FALSE; diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 40197cd61bd..8e0ec715b12 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -36,7 +36,7 @@ in the reset state. Explicit freeing of the mutex with mutex_free is necessary only if the memory block containing it is freed. */ -#define mutex_create(M) mutex_create_func((M), IB__FILE__, __LINE__) +#define mutex_create(M) mutex_create_func((M), __FILE__, __LINE__) /*===================*/ /********************************************************************** Creates, or rather, initializes a mutex object in a specified memory @@ -48,7 +48,7 @@ void mutex_create_func( /*==============*/ mutex_t* mutex, /* in: pointer to memory */ - char* cfile_name, /* in: file name where created */ + const char* cfile_name, /* in: file name where created */ ulint cline); /* in: file line where created */ /********************************************************************** Calling this function is obligatory only if the memory buffer containing @@ -64,7 +64,7 @@ mutex_free( NOTE! The following macro should be used in mutex locking, not the corresponding function. */ -#define mutex_enter(M) mutex_enter_func((M), IB__FILE__, __LINE__) +#define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__) /********************************************************************** A noninlined function that reserves a mutex. In ha_innodb.cc we have disabled inlining of InnoDB functions, and no inlined functions should be called from @@ -80,7 +80,7 @@ corresponding function. */ /* NOTE! 
currently same as mutex_enter! */ -#define mutex_enter_fast(M) mutex_enter_func((M), IB__FILE__, __LINE__) +#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__) #define mutex_enter_fast_func mutex_enter_func; /********************************************************************** NOTE! Use the corresponding macro in the header file, not this function @@ -92,7 +92,7 @@ void mutex_enter_func( /*=============*/ mutex_t* mutex, /* in: pointer to mutex */ - char* file_name, /* in: file name where locked */ + const char* file_name, /* in: file name where locked */ ulint line); /* in: line where locked */ /************************************************************************ Tries to lock the mutex for the current thread. If the lock is not acquired @@ -103,9 +103,9 @@ mutex_enter_nowait( /*===============*/ /* out: 0 if succeed, 1 if not */ mutex_t* mutex, /* in: pointer to mutex */ - char* file_name, /* in: file name where mutex + const char* file_name, /* in: file name where mutex requested */ - ulint line); /* in: line where requested */ + ulint line); /* in: line where requested */ /********************************************************************** Unlocks a mutex owned by the current thread. */ UNIV_INLINE @@ -390,8 +390,8 @@ or row lock! */ #define SYNC_IBUF_HEADER 914 #define SYNC_IBUF_PESS_INSERT_MUTEX 912 #define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below - SYNC_FSP_PAGE: we assign value this - high only to get the program to pass + SYNC_FSP_PAGE: we assign a value this + high only to make the program pass the debug checks */ /*-------------------------------*/ #define SYNC_INDEX_TREE 900 @@ -410,7 +410,7 @@ or row lock! 
*/ #define SYNC_FSP_PAGE 395 /*------------------------------------- Insert buffer headers */ /*------------------------------------- ibuf_mutex */ -/*------------------------------------- Insert buffer trees */ +/*------------------------------------- Insert buffer tree */ #define SYNC_IBUF_BITMAP_MUTEX 351 #define SYNC_IBUF_BITMAP 350 /*-------------------------------*/ @@ -468,7 +468,7 @@ struct mutex_struct { #endif /* UNIV_SYNC_DEBUG */ ulint level; /* Level in the global latching order; default SYNC_LEVEL_NONE */ - char* cfile_name; /* File name where mutex created */ + const char* cfile_name;/* File name where mutex created */ ulint cline; /* Line where created */ ulint magic_n; }; diff --git a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic index 758c8524f66..aaf5e1fd9e9 100644 --- a/innobase/include/sync0sync.ic +++ b/innobase/include/sync0sync.ic @@ -23,7 +23,7 @@ void mutex_spin_wait( /*============*/ mutex_t* mutex, /* in: pointer to mutex */ - char* file_name,/* in: file name where mutex requested */ + const char* file_name,/* in: file name where mutex requested */ ulint line); /* in: line where requested */ #ifdef UNIV_SYNC_DEBUG /********************************************************************** @@ -33,7 +33,7 @@ void mutex_set_debug_info( /*=================*/ mutex_t* mutex, /* in: mutex */ - char* file_name, /* in: file where requested */ + const char* file_name, /* in: file where requested */ ulint line); /* in: line where requested */ #endif /* UNIV_SYNC_DEBUG */ /********************************************************************** @@ -241,9 +241,9 @@ UNIV_INLINE void mutex_enter_func( /*=============*/ - mutex_t* mutex, /* in: pointer to mutex */ - char* file_name,/* in: file name where locked */ - ulint line) /* in: line where locked */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where locked */ + ulint line) /* in: line where locked */ { ut_ad(mutex_validate(mutex)); diff --git 
a/innobase/include/trx0roll.h b/innobase/include/trx0roll.h index 0d7126c9c57..6004551f456 100644 --- a/innobase/include/trx0roll.h +++ b/innobase/include/trx0roll.h @@ -193,7 +193,7 @@ trx_rollback_to_savepoint_for_mysql( DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ trx_t* trx, /* in: transaction handle */ - char* savepoint_name, /* in: savepoint name */ + const char* savepoint_name, /* in: savepoint name */ ib_longlong* mysql_binlog_cache_pos);/* out: the MySQL binlog cache position corresponding to this savepoint; MySQL needs this @@ -211,7 +211,7 @@ trx_savepoint_for_mysql( /*====================*/ /* out: always DB_SUCCESS */ trx_t* trx, /* in: transaction handle */ - char* savepoint_name, /* in: savepoint name */ + const char* savepoint_name, /* in: savepoint name */ ib_longlong binlog_cache_pos); /* in: MySQL binlog cache position corresponding to this connection at the time of the diff --git a/innobase/include/trx0rseg.ic b/innobase/include/trx0rseg.ic index 9a6137eb2e5..35e927f5e79 100644 --- a/innobase/include/trx0rseg.ic +++ b/innobase/include/trx0rseg.ic @@ -67,7 +67,7 @@ trx_rsegf_get_nth_undo( { if (n >= TRX_RSEG_N_SLOTS) { fprintf(stderr, - "InnoDB: Error: trying to get slot %lu of rseg\n", n); + "InnoDB: Error: trying to get slot %lu of rseg\n", (unsigned long) n); ut_error; } @@ -88,7 +88,7 @@ trx_rsegf_set_nth_undo( { if (n >= TRX_RSEG_N_SLOTS) { fprintf(stderr, - "InnoDB: Error: trying to set slot %lu of rseg\n", n); + "InnoDB: Error: trying to set slot %lu of rseg\n", (unsigned long) n); ut_error; } diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h index c7ef4d1929d..8f402881224 100644 --- a/innobase/include/trx0sys.h +++ b/innobase/include/trx0sys.h @@ -24,18 +24,6 @@ Created 3/26/1996 Heikki Tuuri #include "fsp0fsp.h" #include "read0types.h" -/* Do NOT merge this to the 4.1 code base! 
*/ -extern ibool trx_sys_downgrading_from_4_1_1; - -/******************************************************************** -Do NOT merge this to the 4.1 code base! -Marks the trx sys header when we have successfully downgraded from the >= 4.1.1 -multiple tablespace format back to the 4.0 format. */ - -void -trx_sys_mark_downgraded_from_4_1_1(void); -/*====================================*/ - /* In a MySQL replication slave, in crash recovery we store the master log file name and position here. We have successfully got the updates to InnoDB up to this position. If .._pos is -1, it means no crash recovery was needed, @@ -44,26 +32,48 @@ or there was no master log position info inside InnoDB. */ extern char trx_sys_mysql_master_log_name[]; extern ib_longlong trx_sys_mysql_master_log_pos; +/* If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. If .._pos is -1, it means there was no binlog position info inside +InnoDB. */ + +extern char trx_sys_mysql_bin_log_name[]; +extern ib_longlong trx_sys_mysql_bin_log_pos; + /* The transaction system */ extern trx_sys_t* trx_sys; /* Doublewrite system */ extern trx_doublewrite_t* trx_doublewrite; +extern ibool trx_doublewrite_must_reset_space_ids; +extern ibool trx_sys_multiple_tablespace_format; /******************************************************************** -Creates the doublewrite buffer at a database start. The header of the +Creates the doublewrite buffer to a new InnoDB installation. The header of the doublewrite buffer is placed on the trx system header page. 
*/ void trx_sys_create_doublewrite_buf(void); /*================================*/ /******************************************************************** -At a database startup uses a possible doublewrite buffer to restore +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore half-written pages in the data files. */ void -trx_sys_doublewrite_restore_corrupt_pages(void); -/*===========================================*/ +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages); +/******************************************************************** +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format. */ + +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void); +/*===============================================*/ /******************************************************************** Determines if a page number is located inside the doublewrite buffer. */ @@ -256,7 +266,7 @@ replication has proceeded. 
*/ void trx_sys_update_mysql_binlog_offset( /*===============================*/ - char* file_name,/* in: MySQL log file name */ + const char* file_name,/* in: MySQL log file name */ ib_longlong offset, /* in: position in that log file */ ulint field, /* in: offset of the MySQL log info field in the trx sys header */ @@ -369,14 +379,17 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ to disk, we still may be able to recover the information */ #define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE) - /* If this is set to - .._N, then we are - DOWNGRADING from >= 4.1.1 to - 4.0 */ + /* If this is not yet set to + .._N, we must reset the + doublewrite buffer, because + starting from 4.1.x the space + id of a data page is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO */ /*-------------------------------------------------------------*/ #define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 #define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 + #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE /* Doublewrite control struct */ diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 3be16e8f46d..7eb91048684 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -315,7 +315,7 @@ struct trx_struct{ ulint magic_n; /* All the next fields are protected by the kernel mutex, except the undo logs which are protected by undo_mutex */ - char* op_info; /* English text describing the + const char* op_info; /* English text describing the current operation, or an empty string */ ulint type; /* TRX_USER, TRX_PURGE */ @@ -358,7 +358,7 @@ struct trx_struct{ char** mysql_query_str;/* pointer to the field in mysqld_thd which contains the pointer to the current SQL query string */ - char* mysql_log_file_name; + const char* mysql_log_file_name; /* if MySQL binlog is used, this field contains a pointer to the latest file name; this is NULL if binlog is not @@ -366,7 +366,7 @@ struct trx_struct{ ib_longlong mysql_log_offset;/* 
if MySQL binlog is used, this field contains the end offset of the binlog entry */ - char* mysql_master_log_file_name; + const char* mysql_master_log_file_name; /* if the database server is a MySQL replication slave, we have here the master binlog name up to which diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h index 7f0378c68d3..20002076cc3 100644 --- a/innobase/include/trx0undo.h +++ b/innobase/include/trx0undo.h @@ -251,20 +251,6 @@ trx_undo_update_cleanup( page_t* undo_page, /* in: update undo log header page, x-latched */ mtr_t* mtr); /* in: mtr */ -/************************************************************************** -Discards an undo log and puts the segment to the list of cached update undo -log segments. This optimized function is called if there is no need to -keep the update undo log because there exist no read views and the transaction -made no delete markings, which would make purge necessary. We restrict this -to undo logs of size 1 to make things simpler. */ - -dulint -trx_undo_update_cleanup_by_discard( -/*===============================*/ - /* out: log sequence number at which mtr is - committed */ - trx_t* trx, /* in: trx owning the update undo log */ - mtr_t* mtr); /* in: mtr */ /********************************************************************** Frees or caches an insert undo log after a transaction commit or rollback. Knowledge of inserts is not needed after a commit or rollback, therefore diff --git a/innobase/include/univ.i b/innobase/include/univ.i index 4854e5a7b78..be71d4211b3 100644 --- a/innobase/include/univ.i +++ b/innobase/include/univ.i @@ -65,13 +65,7 @@ Microsoft Visual C++ */ #define HAVE_PWRITE #endif -/* Apparently in some old SCO Unixes the return type of sprintf is not -an integer as it should be according to the modern Posix standard. 
Because -of that we define sprintf inside InnoDB code as our own function ut_sprintf */ -#undef sprintf -#define sprintf ut_sprintf - -#endif +#endif /* #if (defined(WIN32) || ... */ /* DEBUG VERSION CONTROL ===================== */ @@ -88,10 +82,9 @@ memory is read outside the allocated blocks. */ /* #define UNIV_DEBUG -#define UNIV_SYNC_DEBUG #define UNIV_MEM_DEBUG - #define UNIV_IBUF_DEBUG +#define UNIV_SYNC_DEBUG #define UNIV_SEARCH_DEBUG #define UNIV_SYNC_PERF_STAT #define UNIV_SEARCH_PERF_STAT @@ -182,27 +175,37 @@ management to ensure correct alignment for doubles etc. */ */ /* Note that inside MySQL 'byte' is defined as char on Linux! */ -#define byte unsigned char +#define byte unsigned char -/* Another basic type we use is unsigned long integer which is intended to be -equal to the word size of the machine. */ +/* Another basic type we use is unsigned long integer which should be equal to +the word size of the machine, that is on a 32-bit platform 32 bits, and on a +64-bit platform 64 bits. We also give the printf format for the type as a +macro ULINTPF. */ #ifdef _WIN64 typedef unsigned __int64 ulint; +#define ULINTPF "%I64u" +typedef __int64 lint; #else typedef unsigned long int ulint; -#endif - +#define ULINTPF "%lu" typedef long int lint; +#endif #ifdef __WIN__ -typedef __int64 ib_longlong; +typedef __int64 ib_longlong; #else -typedef longlong ib_longlong; +typedef longlong ib_longlong; +#endif + +#ifndef __WIN__ +#if SIZEOF_LONG != SIZEOF_VOIDP +#error "Error: InnoDB's ulint must be of the same size as void*" +#endif #endif /* The following type should be at least a 64-bit floating point number */ -typedef double utfloat; +typedef double utfloat; /* The 'undefined' value for a ulint */ #define ULINT_UNDEFINED ((ulint)(-1)) @@ -215,7 +218,7 @@ typedef double utfloat; /* This 'ibool' type is used within Innobase. Remember that different included headers may define 'bool' differently. Do not assume that 'bool' is a ulint! 
*/ -#define ibool ulint +#define ibool ulint #ifndef TRUE @@ -239,11 +242,6 @@ contains the sum of the following flag and the locally stored len. */ #define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE) -/* The following definition of __FILE__ removes compiler warnings -associated with const char* / char* mismatches with __FILE__ */ - -#define IB__FILE__ ((char*)__FILE__) - #include <stdio.h> #include "ut0dbg.h" #include "ut0ut.h" diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h index dd13b19b864..fed6a23d144 100644 --- a/innobase/include/ut0byte.h +++ b/innobase/include/ut0byte.h @@ -152,7 +152,7 @@ ut_dulint_align_up( Increments a dulint variable by 1. */ #define UT_DULINT_INC(D)\ {\ - if ((D).low == 0xFFFFFFFF) {\ + if ((D).low == 0xFFFFFFFFUL) {\ (D).high = (D).high + 1;\ (D).low = 0;\ } else {\ diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic index f0df9cc35a3..5a70dcf12a8 100644 --- a/innobase/include/ut0byte.ic +++ b/innobase/include/ut0byte.ic @@ -152,13 +152,13 @@ ut_dulint_add( dulint a, /* in: dulint */ ulint b) /* in: ulint */ { - if (0xFFFFFFFF - b >= a.low) { + if (0xFFFFFFFFUL - b >= a.low) { a.low += b; return(a); } - a.low = a.low - (0xFFFFFFFF - b) - 1; + a.low = a.low - (0xFFFFFFFFUL - b) - 1; a.high++; @@ -183,7 +183,7 @@ ut_dulint_subtract( b -= a.low + 1; - a.low = 0xFFFFFFFF - b; + a.low = 0xFFFFFFFFUL - b; ut_ad(a.high > 0); @@ -214,7 +214,7 @@ ut_dulint_minus( ut_ad(a.high == b.high + 1); - diff = (ulint)(0xFFFFFFFF - b.low); + diff = (ulint)(0xFFFFFFFFUL - b.low); diff += 1 + a.low; ut_ad(diff > a.low); diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h index bec9cdd42b5..a155f68bd12 100644 --- a/innobase/include/ut0dbg.h +++ b/innobase/include/ut0dbg.h @@ -51,14 +51,13 @@ extern const char* ut_dbg_msg_stop; } while (0) #ifdef UNIV_DEBUG -# define ut_ad(EXPR) ut_a(EXPR) -# define ut_d(EXPR) do {EXPR;} while (0) +#define ut_ad(EXPR) ut_a(EXPR) +#define ut_d(EXPR) do 
{EXPR;} while (0) #else -# define ut_ad(EXPR) -# define ut_d(EXPR) +#define ut_ad(EXPR) +#define ut_d(EXPR) #endif #define UT_NOT_USED(A) A = A #endif - diff --git a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h index 7bee83b151e..2e02b3f0b6b 100644 --- a/innobase/include/ut0mem.h +++ b/innobase/include/ut0mem.h @@ -18,15 +18,15 @@ extern ulint ut_total_allocated_memory; UNIV_INLINE void* -ut_memcpy(void* dest, void* sour, ulint n); +ut_memcpy(void* dest, const void* sour, ulint n); UNIV_INLINE void* -ut_memmove(void* dest, void* sour, ulint n); +ut_memmove(void* dest, const void* sour, ulint n); UNIV_INLINE int -ut_memcmp(void* str1, void* str2, ulint n); +ut_memcmp(const void* str1, const void* str2, ulint n); /************************************************************************** @@ -50,6 +50,16 @@ ut_malloc( /* out, own: allocated memory */ ulint n); /* in: number of bytes to allocate */ /************************************************************************** +Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs +out. It cannot be used if we want to return an error message. Prints a +message to stderr if it fails. */ + +ibool +ut_test_malloc( +/*===========*/ + /* out: TRUE if succeeded */ + ulint n); /* in: try to allocate this many bytes */ +/************************************************************************** Frees a memory bloock allocated with ut_malloc. */ void @@ -96,7 +106,7 @@ ut_free_all_mem(void); UNIV_INLINE char* -ut_strcpy(char* dest, char* sour); +ut_strcpy(char* dest, const char* sour); UNIV_INLINE ulint @@ -104,7 +114,7 @@ ut_strlen(const char* str); UNIV_INLINE int -ut_strcmp(void* str1, void* str2); +ut_strcmp(const void* str1, const void* str2); /************************************************************************** Determine the length of a string when it is quoted with ut_strcpyq(). 
*/ diff --git a/innobase/include/ut0mem.ic b/innobase/include/ut0mem.ic index 951d9538424..3bb30a80f22 100644 --- a/innobase/include/ut0mem.ic +++ b/innobase/include/ut0mem.ic @@ -8,28 +8,28 @@ Created 5/30/1994 Heikki Tuuri UNIV_INLINE void* -ut_memcpy(void* dest, void* sour, ulint n) +ut_memcpy(void* dest, const void* sour, ulint n) { return(memcpy(dest, sour, n)); } UNIV_INLINE void* -ut_memmove(void* dest, void* sour, ulint n) +ut_memmove(void* dest, const void* sour, ulint n) { return(memmove(dest, sour, n)); } UNIV_INLINE int -ut_memcmp(void* str1, void* str2, ulint n) +ut_memcmp(const void* str1, const void* str2, ulint n) { return(memcmp(str1, str2, n)); } UNIV_INLINE char* -ut_strcpy(char* dest, char* sour) +ut_strcpy(char* dest, const char* sour) { return(strcpy(dest, sour)); } @@ -43,9 +43,9 @@ ut_strlen(const char* str) UNIV_INLINE int -ut_strcmp(void* str1, void* str2) +ut_strcmp(const void* str1, const void* str2) { - return(strcmp((char*)str1, (char*)str2)); + return(strcmp((const char*)str1, (const char*)str2)); } /************************************************************************** diff --git a/innobase/include/ut0rnd.h b/innobase/include/ut0rnd.h index c8ef0dd4001..aeec5d2f6eb 100644 --- a/innobase/include/ut0rnd.h +++ b/innobase/include/ut0rnd.h @@ -92,17 +92,17 @@ UNIV_INLINE ulint ut_fold_string( /*===========*/ - /* out: folded value */ - char* str); /* in: null-terminated string */ + /* out: folded value */ + const char* str); /* in: null-terminated string */ /***************************************************************** Folds a binary string. */ UNIV_INLINE ulint ut_fold_binary( /*===========*/ - /* out: folded value */ - byte* str, /* in: string of bytes */ - ulint len); /* in: length */ + /* out: folded value */ + const byte* str, /* in: string of bytes */ + ulint len); /* in: length */ /*************************************************************** Looks for a prime number slightly greater than the given argument. 
The prime is chosen so that it is not near any power of 2. */ diff --git a/innobase/include/ut0rnd.ic b/innobase/include/ut0rnd.ic index 5493c37404a..06d7012f60b 100644 --- a/innobase/include/ut0rnd.ic +++ b/innobase/include/ut0rnd.ic @@ -173,8 +173,8 @@ UNIV_INLINE ulint ut_fold_string( /*===========*/ - /* out: folded value */ - char* str) /* in: null-terminated string */ + /* out: folded value */ + const char* str) /* in: null-terminated string */ { #ifdef UNIV_DEBUG ulint i = 0; @@ -203,9 +203,9 @@ UNIV_INLINE ulint ut_fold_binary( /*===========*/ - /* out: folded value */ - byte* str, /* in: string of bytes */ - ulint len) /* in: length */ + /* out: folded value */ + const byte* str, /* in: string of bytes */ + ulint len) /* in: length */ { ulint i; ulint fold = 0; diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h index b349fe26015..f4a682c57ec 100644 --- a/innobase/include/ut0ut.h +++ b/innobase/include/ut0ut.h @@ -17,20 +17,6 @@ Created 1/20/1994 Heikki Tuuri typedef time_t ib_time_t; - -/************************************************************ -Uses vsprintf to emulate sprintf so that the function always returns -the printed length. Apparently in some old SCO Unixes sprintf did not -return the printed length but a pointer to the end of the printed string. */ - -ulint -ut_sprintf( -/*=======*/ - char* buf, /* in/out: buffer where to print */ - const char* format, /* in: format of prints */ - ...) /* in: arguments to be printed */ - __attribute__((__format__ (__printf__, 2, 3))); - /************************************************************ Gets the high 32 bits in a ulint. 
That is makes a shift >> 32, but since there seem to be compiler bugs in both gcc and Visual C++, @@ -141,7 +127,7 @@ void ut_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high); /*============================================================*/ /************************************************************ -The following function returns a clock time in milliseconds. */ +The following function returns elapsed CPU time in milliseconds. */ ulint ut_clock(void); @@ -176,6 +162,14 @@ ut_sprintf_timestamp( /*=================*/ char* buf); /* in: buffer where to sprintf */ /************************************************************** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. */ + +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf); /* in: buffer where to sprintf */ +/************************************************************** Returns current year, month, day. */ void diff --git a/innobase/include/ut0ut.ic b/innobase/include/ut0ut.ic index 9d7dd283f29..9a0ef1c0d5b 100644 --- a/innobase/include/ut0ut.ic +++ b/innobase/include/ut0ut.ic @@ -110,7 +110,7 @@ ut_2pow_remainder( ulint n, /* in: number to be divided */ ulint m) /* in: divisor; power of 2 */ { - ut_ad(0x80000000 % m == 0); + ut_ad(0x80000000UL % m == 0); return(n & (m - 1)); } @@ -125,7 +125,7 @@ ut_2pow_round( ulint n, /* in: number to be rounded */ ulint m) /* in: divisor; power of 2 */ { - ut_ad(0x80000000 % m == 0); + ut_ad(0x80000000UL % m == 0); return(n & ~(m - 1)); } diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 68dd2aa18c1..92e8f224dea 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -402,10 +402,10 @@ lock_check_trx_id_sanity( fprintf(stderr, "\n" "InnoDB: is %lu %lu which is higher than the global trx id counter %lu %lu!\n" "InnoDB: The table is corrupt. 
You have to do dump + drop + reimport.\n", - ut_dulint_get_high(trx_id), - ut_dulint_get_low(trx_id), - ut_dulint_get_high(trx_sys->max_trx_id), - ut_dulint_get_low(trx_sys->max_trx_id)); + (ulong) ut_dulint_get_high(trx_id), + (ulong) ut_dulint_get_low(trx_id), + (ulong) ut_dulint_get_high(trx_sys->max_trx_id), + (ulong) ut_dulint_get_low(trx_sys->max_trx_id)); is_ok = FALSE; } @@ -1686,7 +1686,7 @@ lock_rec_enqueue_waiting( if (lock_print_waits) { fprintf(stderr, "Lock wait for trx %lu in index ", - ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_low(trx->id)); ut_print_name(stderr, index->name); } @@ -2029,7 +2029,7 @@ lock_grant( if (lock_print_waits) { fprintf(stderr, "Lock wait for trx %lu ends\n", - ut_dulint_get_low(lock->trx->id)); + (ulong) ut_dulint_get_low(lock->trx->id)); } /* If we are resolving a deadlock by choosing another transaction @@ -3597,7 +3597,8 @@ lock_release_off_kernel( ut_ad(lock_get_type(lock) & LOCK_TABLE); if (lock_get_mode(lock) != LOCK_IS - && (trx->insert_undo || trx->update_undo)) { + && 0 != ut_dulint_cmp(trx->undo_no, + ut_dulint_zero)) { /* The trx may have modified the table. 
We block the use of the MySQL query cache @@ -3820,7 +3821,7 @@ lock_table_print( fputs("TABLE LOCK table ", file); ut_print_name(file, lock->un_member.tab_lock.table->name); fprintf(file, " trx id %lu %lu", - (lock->trx)->id.high, (lock->trx)->id.low); + (ulong) (lock->trx)->id.high, (ulong) (lock->trx)->id.low); if (lock_get_mode(lock) == LOCK_S) { fputs(" lock mode S", file); @@ -3833,7 +3834,7 @@ lock_table_print( } else if (lock_get_mode(lock) == LOCK_AUTO_INC) { fputs(" lock mode AUTO-INC", file); } else { - fprintf(file, " unknown lock mode %lu", lock_get_mode(lock)); + fprintf(file, " unknown lock mode %lu", (ulong) lock_get_mode(lock)); } if (lock_get_wait(lock)) { @@ -3867,10 +3868,12 @@ lock_rec_print( page_no = lock->un_member.rec_lock.page_no; fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu ", - space, page_no, lock_rec_get_n_bits(lock)); + (ulong) space, (ulong) page_no, + (ulong) lock_rec_get_n_bits(lock)); dict_index_name_print(file, lock->index); fprintf(file, " trx id %lu %lu", - (lock->trx)->id.high, (lock->trx)->id.low); + (ulong) (lock->trx)->id.high, + (ulong) (lock->trx)->id.low); if (lock_get_mode(lock) == LOCK_S) { fputs(" lock mode S", file); @@ -3906,7 +3909,7 @@ lock_rec_print( page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL, BUF_GET_IF_IN_POOL, - IB__FILE__, __LINE__, &mtr); + __FILE__, __LINE__, &mtr); if (page) { page = buf_page_get_nowait(space, page_no, RW_S_LATCH, &mtr); @@ -3930,7 +3933,7 @@ lock_rec_print( if (lock_rec_get_nth_bit(lock, i)) { - fprintf(file, "Record lock, heap no %lu ", i); + fprintf(file, "Record lock, heap no %lu ", (ulong) i); if (page) { rec_print(file, @@ -4014,19 +4017,19 @@ lock_print_info( "------------\n", file); fprintf(file, "Trx id counter %lu %lu\n", - ut_dulint_get_high(trx_sys->max_trx_id), - ut_dulint_get_low(trx_sys->max_trx_id)); + (ulong) ut_dulint_get_high(trx_sys->max_trx_id), + (ulong) ut_dulint_get_low(trx_sys->max_trx_id)); fprintf(file, "Purge done for trx's n:o 
< %lu %lu undo n:o < %lu %lu\n", - ut_dulint_get_high(purge_sys->purge_trx_no), - ut_dulint_get_low(purge_sys->purge_trx_no), - ut_dulint_get_high(purge_sys->purge_undo_no), - ut_dulint_get_low(purge_sys->purge_undo_no)); + (ulong) ut_dulint_get_high(purge_sys->purge_trx_no), + (ulong) ut_dulint_get_low(purge_sys->purge_trx_no), + (ulong) ut_dulint_get_high(purge_sys->purge_undo_no), + (ulong) ut_dulint_get_low(purge_sys->purge_undo_no)); fprintf(file, "Total number of lock structs in row lock hash table %lu\n", - lock_get_n_rec_locks()); + (ulong) lock_get_n_rec_locks()); fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n"); @@ -4074,16 +4077,16 @@ loop: if (trx->read_view) { fprintf(file, "Trx read view will not see trx with id >= %lu %lu, sees < %lu %lu\n", - ut_dulint_get_high(trx->read_view->low_limit_id), - ut_dulint_get_low(trx->read_view->low_limit_id), - ut_dulint_get_high(trx->read_view->up_limit_id), - ut_dulint_get_low(trx->read_view->up_limit_id)); + (ulong) ut_dulint_get_high(trx->read_view->low_limit_id), + (ulong) ut_dulint_get_low(trx->read_view->low_limit_id), + (ulong) ut_dulint_get_high(trx->read_view->up_limit_id), + (ulong) ut_dulint_get_low(trx->read_view->up_limit_id)); } if (trx->que_state == TRX_QUE_LOCK_WAIT) { fprintf(file, "------- TRX HAS BEEN WAITING %lu SEC FOR THIS LOCK TO BE GRANTED:\n", - (ulint)difftime(time(NULL), trx->wait_started)); + (ulong)difftime(time(NULL), trx->wait_started)); if (lock_get_type(trx->wait_lock) == LOCK_REC) { lock_rec_print(file, trx->wait_lock); @@ -4373,7 +4376,7 @@ loop: rec = page_find_rec_with_heap_no(page, i); fprintf(stderr, - "Validating %lu %lu\n", space, page_no); + "Validating %lu %lu\n", (ulong) space, (ulong) page_no); lock_mutex_exit_kernel(); diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index 381d11e4cce..923ab448e07 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -24,7 +24,8 @@ Created 12/9/1995 Heikki Tuuri #include "trx0sys.h" #include "trx0trx.h" 
-/* Current free limit; protected by the log sys mutex; 0 means uninitialized */ +/* Current free limit of space 0; protected by the log sys mutex; 0 means +uninitialized */ ulint log_fsp_current_free_limit = 0; /* Global log system variable */ @@ -40,9 +41,11 @@ old */ ibool log_has_printed_chkp_warning = FALSE; time_t log_last_warning_time; +#ifdef UNIV_LOG_ARCHIVE /* Pointer to this variable is used as the i/o-message when we do i/o to an archive */ byte log_archive_io; +#endif /* UNIV_LOG_ARCHIVE */ /* A margin for free space in the log buffer before a log entry is catenated */ #define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE) @@ -89,20 +92,14 @@ static void log_io_complete_checkpoint(void); /*============================*/ +#ifdef UNIV_LOG_ARCHIVE /********************************************************** Completes an archiving i/o. */ static void log_io_complete_archive(void); /*=========================*/ -/******************************************************************** -Tries to establish a big enough margin of free space in the log groups, such -that a new log entry can be catenated without an immediate need for a -archiving. */ -static -void -log_archive_margin(void); -/*====================*/ +#endif /* UNIV_LOG_ARCHIVE */ /******************************************************************** Sets the global variable log_fsp_current_free_limit. 
Also makes a checkpoint, @@ -167,9 +164,13 @@ log_reserve_and_open( { log_t* log = log_sys; ulint len_upper_limit; +#ifdef UNIV_LOG_ARCHIVE ulint archived_lsn_age; - ulint count = 0; ulint dummy; +#endif /* UNIV_LOG_ARCHIVE */ +#ifdef UNIV_DEBUG + ulint count = 0; +#endif /* UNIV_DEBUG */ ut_a(len < log->buf_size / 2); loop: @@ -189,20 +190,18 @@ loop: log_buffer_flush_to_disk(); - count++; - - ut_ad(count < 50); + ut_ad(++count < 50); goto loop; } +#ifdef UNIV_LOG_ARCHIVE if (log->archiving_state != LOG_ARCH_OFF) { - archived_lsn_age = ut_dulint_minus(log->lsn, log->archived_lsn); - + archived_lsn_age = ut_dulint_minus(log->lsn, + log->archived_lsn); if (archived_lsn_age + len_upper_limit > log->max_archived_lsn_age) { - /* Not enough free archived space in log groups: do a synchronous archive write batch: */ @@ -212,13 +211,12 @@ loop: log_archive_do(TRUE, &dummy); - count++; - - ut_ad(count < 50); + ut_ad(++count < 50); goto loop; } } +#endif /* UNIV_LOG_ARCHIVE */ #ifdef UNIV_LOG_DEBUG log->old_buf_free = log->buf_free; @@ -357,7 +355,8 @@ log_close(void) "InnoDB: If you are using big BLOB or TEXT rows, you must set the\n" "InnoDB: combined size of log files at least 10 times bigger than the\n" "InnoDB: largest such row.\n", - checkpoint_age, log->log_group_capacity); + (ulong) checkpoint_age, + (ulong) log->log_group_capacity); } } @@ -385,6 +384,7 @@ function_exit: return(lsn); } +#ifdef UNIV_LOG_ARCHIVE /********************************************************** Pads the current log block full with dummy log records. Used in producing consistent archived log files. 
*/ @@ -417,6 +417,7 @@ log_pad_current_log_block(void) ut_a((ut_dulint_get_low(lsn) % OS_FILE_LOG_BLOCK_SIZE) == LOG_BLOCK_HDR_SIZE); } +#endif /* UNIV_LOG_ARCHIVE */ /********************************************************** Calculates the data capacity of a log group, when the log file headers are not @@ -479,7 +480,8 @@ ulint log_group_calc_lsn_offset( /*======================*/ /* out: offset within the log group */ - dulint lsn, /* in: lsn, must be within 4 GB of group->lsn */ + dulint lsn, /* in: lsn, must be within 4 GB of + group->lsn */ log_group_t* group) /* in: log group */ { dulint gr_lsn; @@ -669,11 +671,13 @@ log_calc_max_ages(void) / LOG_POOL_CHECKPOINT_RATIO_ASYNC; log_sys->max_checkpoint_age = margin; +#ifdef UNIV_LOG_ARCHIVE log_sys->max_archived_lsn_age = smallest_archive_margin; log_sys->max_archived_lsn_age_async = smallest_archive_margin - smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC; +#endif /* UNIV_LOG_ARCHIVE */ failure: mutex_exit(&(log_sys->mutex)); @@ -773,7 +777,9 @@ log_init(void) memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE); /*----------------------------*/ - log_sys->archiving_state = LOG_ARCH_ON; +#ifdef UNIV_LOG_ARCHIVE + /* Under MySQL, log archiving is always off */ + log_sys->archiving_state = LOG_ARCH_OFF; log_sys->archived_lsn = log_sys->lsn; log_sys->next_archived_lsn = ut_dulint_zero; @@ -782,15 +788,18 @@ log_init(void) rw_lock_create(&(log_sys->archive_lock)); rw_lock_set_level(&(log_sys->archive_lock), SYNC_NO_ORDER_CHECK); - log_sys->archive_buf = ut_align( + log_sys->archive_buf = NULL; + + /* ut_align( ut_malloc(LOG_ARCHIVE_BUF_SIZE + OS_FILE_LOG_BLOCK_SIZE), - OS_FILE_LOG_BLOCK_SIZE); - log_sys->archive_buf_size = LOG_ARCHIVE_BUF_SIZE; + OS_FILE_LOG_BLOCK_SIZE); */ + log_sys->archive_buf_size = 0; - memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); + /* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */ log_sys->archiving_on = os_event_create(NULL); +#endif /* UNIV_LOG_ARCHIVE 
*/ /*----------------------------*/ @@ -826,7 +835,8 @@ log_group_init( ulint space_id, /* in: space id of the file space which contains the log files of this group */ - ulint archive_space_id) /* in: space id of the file space + ulint archive_space_id __attribute__((unused))) + /* in: space id of the file space which contains some archived log files for this group; currently, only for the first log group this is @@ -848,7 +858,9 @@ log_group_init( group->n_pending_writes = 0; group->file_header_bufs = mem_alloc(sizeof(byte*) * n_files); +#ifdef UNIV_LOG_ARCHIVE group->archive_file_header_bufs = mem_alloc(sizeof(byte*) * n_files); +#endif /* UNIV_LOG_ARCHIVE */ for (i = 0; i < n_files; i++) { *(group->file_header_bufs + i) = ut_align( @@ -858,17 +870,21 @@ log_group_init( memset(*(group->file_header_bufs + i), '\0', LOG_FILE_HDR_SIZE); +#ifdef UNIV_LOG_ARCHIVE *(group->archive_file_header_bufs + i) = ut_align( mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE), OS_FILE_LOG_BLOCK_SIZE); memset(*(group->archive_file_header_bufs + i), '\0', LOG_FILE_HDR_SIZE); +#endif /* UNIV_LOG_ARCHIVE */ } +#ifdef UNIV_LOG_ARCHIVE group->archive_space_id = archive_space_id; group->archived_file_no = 0; group->archived_offset = 0; +#endif /* UNIV_LOG_ARCHIVE */ group->checkpoint_buf = ut_align( mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE), @@ -931,7 +947,7 @@ log_group_check_flush_completion( if (!log_sys->one_flushed && group->n_pending_writes == 0) { if (log_debug_writes) { fprintf(stderr, - "Log flushed first to group %lu\n", group->id); + "Log flushed first to group %lu\n", (ulong) group->id); } log_sys->written_to_some_lsn = log_sys->write_lsn; @@ -942,7 +958,7 @@ log_group_check_flush_completion( if (log_debug_writes && (group->n_pending_writes == 0)) { - fprintf(stderr, "Log flushed to group %lu\n", group->id); + fprintf(stderr, "Log flushed to group %lu\n", (ulong) group->id); } return(0); @@ -1001,6 +1017,7 @@ log_io_complete( { ulint unlock; +#ifdef UNIV_LOG_ARCHIVE if 
((byte*)group == &log_archive_io) { /* It was an archive write */ @@ -1008,6 +1025,7 @@ log_io_complete( return; } +#endif /* UNIV_LOG_ARCHIVE */ if ((ulint)group & 0x1UL) { /* It was a checkpoint write */ @@ -1089,7 +1107,7 @@ log_group_file_header_flush( if (log_debug_writes) { fprintf(stderr, "Writing log file header to group %lu file %lu\n", - group->id, nth_file); + (ulong) group->id, (ulong) nth_file); } if (log_do_write) { @@ -1168,7 +1186,8 @@ loop: if ((next_offset % group->file_size) + len > group->file_size) { - write_len = group->file_size - (next_offset % group->file_size); + write_len = group->file_size + - (next_offset % group->file_size); } else { write_len = len; } @@ -1179,11 +1198,12 @@ loop: "Writing log file segment to group %lu offset %lu len %lu\n" "start lsn %lu %lu\n" "First block n:o %lu last block n:o %lu\n", - group->id, next_offset, write_len, - ut_dulint_get_high(start_lsn), - ut_dulint_get_low(start_lsn), - log_block_get_hdr_no(buf), - log_block_get_hdr_no( + (ulong) group->id, (ulong) next_offset, + (ulong) write_len, + (ulong) ut_dulint_get_high(start_lsn), + (ulong) ut_dulint_get_low(start_lsn), + (ulong) log_block_get_hdr_no(buf), + (ulong) log_block_get_hdr_no( buf + write_len - OS_FILE_LOG_BLOCK_SIZE)); ut_a(log_block_get_hdr_no(buf) == log_block_convert_lsn_to_no(start_lsn)); @@ -1325,10 +1345,10 @@ loop: if (log_debug_writes) { fprintf(stderr, "Writing log from %lu %lu up to lsn %lu %lu\n", - ut_dulint_get_high(log_sys->written_to_all_lsn), - ut_dulint_get_low(log_sys->written_to_all_lsn), - ut_dulint_get_high(log_sys->lsn), - ut_dulint_get_low(log_sys->lsn)); + (ulong) ut_dulint_get_high(log_sys->written_to_all_lsn), + (ulong) ut_dulint_get_low(log_sys->written_to_all_lsn), + (ulong) ut_dulint_get_high(log_sys->lsn), + (ulong) ut_dulint_get_low(log_sys->lsn)); } log_sys->n_pending_writes++; @@ -1618,8 +1638,10 @@ log_group_checkpoint( log_group_t* group) /* in: log group */ { log_group_t* group2; +#ifdef UNIV_LOG_ARCHIVE 
dulint archived_lsn; dulint next_archived_lsn; +#endif /* UNIV_LOG_ARCHIVE */ ulint write_offset; ulint fold; byte* buf; @@ -1642,6 +1664,7 @@ log_group_checkpoint( mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size); +#ifdef UNIV_LOG_ARCHIVE if (log_sys->archiving_state == LOG_ARCH_OFF) { archived_lsn = ut_dulint_max; } else { @@ -1653,8 +1676,11 @@ log_group_checkpoint( /* For debugging only */ } } - + mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn); +#else /* UNIV_LOG_ARCHIVE */ + mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, ut_dulint_max); +#endif /* UNIV_LOG_ARCHIVE */ for (i = 0; i < LOG_MAX_N_GROUPS; i++) { log_checkpoint_set_nth_group_info(buf, i, 0, 0); @@ -1664,8 +1690,13 @@ log_group_checkpoint( while (group2) { log_checkpoint_set_nth_group_info(buf, group2->id, +#ifdef UNIV_LOG_ARCHIVE group2->archived_file_no, - group2->archived_offset); + group2->archived_offset +#else /* UNIV_LOG_ARCHIVE */ + 0, 0 +#endif /* UNIV_LOG_ARCHIVE */ + ); group2 = UT_LIST_GET_NEXT(log_groups, group2); } @@ -1716,7 +1747,7 @@ log_group_checkpoint( OS_FILE_LOG_BLOCK_SIZE, buf, ((byte*)group + 1)); - ut_ad(((ulint)group & 0x1) == 0); + ut_ad(((ulint)group & 0x1UL) == 0); } } @@ -1890,9 +1921,9 @@ log_checkpoint( if (log_debug_writes) { fprintf(stderr, "Making checkpoint no %lu at lsn %lu %lu\n", - ut_dulint_get_low(log_sys->next_checkpoint_no), - ut_dulint_get_high(oldest_lsn), - ut_dulint_get_low(oldest_lsn)); + (ulong) ut_dulint_get_low(log_sys->next_checkpoint_no), + (ulong) ut_dulint_get_high(oldest_lsn), + (ulong) ut_dulint_get_low(oldest_lsn)); } log_groups_write_checkpoint_info(); @@ -2089,16 +2120,18 @@ loop: len = group->file_size - (source_offset % group->file_size); } +#ifdef UNIV_LOG_ARCHIVE if (type == LOG_ARCHIVE) { log_sys->n_pending_archive_ios++; } +#endif /* UNIV_LOG_ARCHIVE */ log_sys->n_log_ios++; fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, source_offset / UNIV_PAGE_SIZE, source_offset % 
UNIV_PAGE_SIZE, - len, buf, &log_archive_io); + len, buf, NULL); start_lsn = ut_dulint_add(start_lsn, len); buf += len; @@ -2109,6 +2142,7 @@ loop: } } +#ifdef UNIV_LOG_ARCHIVE /********************************************************** Generates an archived log file name. */ @@ -2121,7 +2155,7 @@ log_archived_file_name_gen( currently we only archive the first group */ ulint file_no)/* in: file number */ { - sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, file_no); + sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no); } /********************************************************** @@ -2252,7 +2286,6 @@ loop: log_archived_file_name_gen(name, group->id, group->archived_file_no + n_files); - fil_reserve_right_to_open(); file_handle = os_file_create(name, open_mode, OS_FILE_AIO, OS_DATA_FILE, &ret); @@ -2280,12 +2313,10 @@ loop: ut_a(ret); - fil_release_right_to_open(); - /* Add the archive file as a node to the space */ fil_node_create(name, group->file_size / UNIV_PAGE_SIZE, - group->archive_space_id); + group->archive_space_id, FALSE); if (next_offset % group->file_size == 0) { log_group_archive_file_header_write(group, n_files, @@ -2306,9 +2337,9 @@ loop: if (log_debug_writes) { fprintf(stderr, "Archiving starting at lsn %lu %lu, len %lu to group %lu\n", - ut_dulint_get_high(start_lsn), - ut_dulint_get_low(start_lsn), - len, group->id); + (ulong) ut_dulint_get_high(start_lsn), + (ulong) ut_dulint_get_low(start_lsn), + (ulong) len, (ulong) group->id); } log_sys->n_pending_archive_ios++; @@ -2403,7 +2434,7 @@ log_archive_write_complete_groups(void) if (log_debug_writes && trunc_files) { fprintf(stderr, "Complete file(s) archived to group %lu\n", - group->id); + (ulong) group->id); } /* Calculate the archive file space start lsn */ @@ -2538,7 +2569,7 @@ loop: start_lsn = log_sys->archived_lsn; if (calc_new_limit) { - ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0); 
limit_lsn = ut_dulint_add(start_lsn, log_sys->archive_buf_size); @@ -2595,10 +2626,10 @@ loop: if (log_debug_writes) { fprintf(stderr, "Archiving from lsn %lu %lu to lsn %lu %lu\n", - ut_dulint_get_high(log_sys->archived_lsn), - ut_dulint_get_low(log_sys->archived_lsn), - ut_dulint_get_high(limit_lsn), - ut_dulint_get_low(limit_lsn)); + (ulong) ut_dulint_get_high(log_sys->archived_lsn), + (ulong) ut_dulint_get_low(log_sys->archived_lsn), + (ulong) ut_dulint_get_high(limit_lsn), + (ulong) ut_dulint_get_low(limit_lsn)); } /* Read the log segment to the archive buffer */ @@ -2677,11 +2708,15 @@ log_archive_close_groups( ut_ad(mutex_own(&(log_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ + if (log_sys->archiving_state == LOG_ARCH_OFF) { + + return; + } + group = UT_LIST_GET_FIRST(log_sys->log_groups); trunc_len = UNIV_PAGE_SIZE * fil_space_get_size(group->archive_space_id); - if (trunc_len > 0) { ut_a(trunc_len == group->file_size); @@ -2701,7 +2736,8 @@ log_archive_close_groups( if (log_debug_writes) { fprintf(stderr, "Incrementing arch file no to %lu in log group %lu\n", - group->archived_file_no + 2, group->id); + (ulong) group->archived_file_no + 2, + (ulong) group->id); } } } @@ -2709,14 +2745,13 @@ log_archive_close_groups( /******************************************************************** Writes the log contents to the archive up to the lsn when this function was called, and stops the archiving. When archiving is started again, the archived -log file numbers start from 2 higher, so that the archiving will -not write again to the archived log files which exist when this function -returns. */ +log file numbers start from 2 higher, so that the archiving will not write +again to the archived log files which exist when this function returns. 
*/ ulint log_archive_stop(void) /*==================*/ - /* out: DB_SUCCESS or DB_ERROR */ + /* out: DB_SUCCESS or DB_ERROR */ { ibool success; @@ -2732,7 +2767,7 @@ log_archive_stop(void) log_sys->archiving_state = LOG_ARCH_STOPPING; mutex_exit(&(log_sys->mutex)); - + log_archive_all(); mutex_enter(&(log_sys->mutex)); @@ -2753,7 +2788,7 @@ log_archive_stop(void) if appropriate */ log_archive_close_groups(TRUE); - + mutex_exit(&(log_sys->mutex)); /* Make a checkpoint, so that if recovery is needed, the file numbers @@ -2847,7 +2882,7 @@ log_archive_archivelog(void) log_sys->archiving_state = LOG_ARCH_ON; log_sys->archived_lsn = ut_dulint_align_down(log_sys->lsn, - OS_FILE_LOG_BLOCK_SIZE); + OS_FILE_LOG_BLOCK_SIZE); mutex_exit(&(log_sys->mutex)); return(DB_SUCCESS); @@ -2911,6 +2946,7 @@ loop: goto loop; } } +#endif /* UNIV_LOG_ARCHIVE */ /************************************************************************ Checks that there is enough free space in the log to start a new query step. @@ -2927,7 +2963,9 @@ loop: log_checkpoint_margin(); +#ifdef UNIV_LOG_ARCHIVE log_archive_margin(); +#endif /* UNIV_LOG_ARCHIVE */ mutex_enter(&(log_sys->mutex)); @@ -2989,9 +3027,12 @@ loop: mutex_enter(&(log_sys->mutex)); - if (log_sys->n_pending_archive_ios - + log_sys->n_pending_checkpoint_writes - + log_sys->n_pending_writes > 0) { + if ( +#ifdef UNIV_LOG_ARCHIVE + log_sys->n_pending_archive_ios || +#endif /* UNIV_LOG_ARCHIVE */ + log_sys->n_pending_checkpoint_writes || + log_sys->n_pending_writes) { mutex_exit(&(log_sys->mutex)); @@ -3005,7 +3046,9 @@ loop: goto loop; } +#ifdef UNIV_LOG_ARCHIVE log_archive_all(); +#endif /* UNIV_LOG_ARCHIVE */ log_make_checkpoint_at(ut_dulint_max, TRUE); mutex_enter(&(log_sys->mutex)); @@ -3013,25 +3056,31 @@ loop: lsn = log_sys->lsn; if (ut_dulint_cmp(lsn, log_sys->last_checkpoint_lsn) != 0 +#ifdef UNIV_LOG_ARCHIVE || (srv_log_archive_on && ut_dulint_cmp(lsn, ut_dulint_add(log_sys->archived_lsn, LOG_BLOCK_HDR_SIZE)) - != 0)) { + != 0) +#endif 
/* UNIV_LOG_ARCHIVE */ + ) { mutex_exit(&(log_sys->mutex)); goto loop; } - arch_log_no = - UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no; - + arch_log_no = 0; + +#ifdef UNIV_LOG_ARCHIVE + UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no; + if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) { - + arch_log_no--; } - + log_archive_close_groups(TRUE); +#endif /* UNIV_LOG_ARCHIVE */ mutex_exit(&(log_sys->mutex)); @@ -3080,10 +3129,24 @@ loop: ut_a(buf_all_freed()); ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn)); + if (ut_dulint_cmp(lsn, srv_start_lsn) < 0) { + fprintf(stderr, +"InnoDB: Error: log sequence number at shutdown %lu %lu\n" +"InnoDB: is lower than at startup %lu %lu!\n", + (ulong) ut_dulint_get_high(lsn), + (ulong) ut_dulint_get_low(lsn), + (ulong) ut_dulint_get_high(srv_start_lsn), + (ulong) ut_dulint_get_low(srv_start_lsn)); + } + + srv_shutdown_lsn = lsn; + fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); fil_flush_file_spaces(FIL_TABLESPACE); + fil_close_all_files(); + /* Make some checks that the server really is quiet */ ut_a(srv_n_threads_active[SRV_MASTER] == 0); ut_a(buf_all_freed()); @@ -3127,8 +3190,8 @@ log_check_log_recs( ut_memcpy(scan_buf, start, end - start); recv_scan_log_recs(TRUE, - buf_pool_get_curr_size() - - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + (buf_pool->n_frames - + recv_n_pool_free_frames) * UNIV_PAGE_SIZE, FALSE, scan_buf, end - start, ut_dulint_align_down(buf_start_lsn, OS_FILE_LOG_BLOCK_SIZE), @@ -3153,8 +3216,7 @@ log_peek_lsn( log system mutex */ dulint* lsn) /* out: if returns TRUE, current lsn is here */ { - if (0 == mutex_enter_nowait(&(log_sys->mutex), (char*)__FILE__, - __LINE__)) { + if (0 == mutex_enter_nowait(&(log_sys->mutex), __FILE__, __LINE__)) { *lsn = log_sys->lsn; mutex_exit(&(log_sys->mutex)); @@ -3182,12 +3244,12 @@ log_print( "Log sequence number %lu %lu\n" "Log flushed up to %lu %lu\n" "Last checkpoint at %lu %lu\n", - ut_dulint_get_high(log_sys->lsn), - 
ut_dulint_get_low(log_sys->lsn), - ut_dulint_get_high(log_sys->flushed_to_disk_lsn), - ut_dulint_get_low(log_sys->flushed_to_disk_lsn), - ut_dulint_get_high(log_sys->last_checkpoint_lsn), - ut_dulint_get_low(log_sys->last_checkpoint_lsn)); + (ulong) ut_dulint_get_high(log_sys->lsn), + (ulong) ut_dulint_get_low(log_sys->lsn), + (ulong) ut_dulint_get_high(log_sys->flushed_to_disk_lsn), + (ulong) ut_dulint_get_low(log_sys->flushed_to_disk_lsn), + (ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn), + (ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn)); current_time = time(NULL); @@ -3196,10 +3258,10 @@ log_print( fprintf(file, "%lu pending log writes, %lu pending chkp writes\n" "%lu log i/o's done, %.2f log i/o's/second\n", - log_sys->n_pending_writes, - log_sys->n_pending_checkpoint_writes, - log_sys->n_log_ios, - (log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed); + (ulong) log_sys->n_pending_writes, + (ulong) log_sys->n_pending_checkpoint_writes, + (ulong) log_sys->n_log_ios, + ((log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed)); log_sys->n_log_ios_old = log_sys->n_log_ios; log_sys->last_printout_time = current_time; diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index 51941a14656..7e57efcf9e1 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -17,6 +17,7 @@ Created 9/20/1997 Heikki Tuuri #include "buf0flu.h" #include "buf0rea.h" #include "srv0srv.h" +#include "srv0start.h" #include "mtr0mtr.h" #include "mtr0log.h" #include "page0page.h" @@ -33,6 +34,13 @@ Created 9/20/1997 Heikki Tuuri #include "dict0boot.h" #include "fil0fil.h" +#ifdef UNIV_HOTBACKUP +/* This is set to FALSE if the backup was originally taken with the +ibbackup --include regexp option: then we do not want to create tables in +directories which were not included */ +ibool recv_replay_file_ops = TRUE; +#endif /* UNIV_HOTBACKUP */ + /* Log records are stored in the hash table in chunks at most of this size; this must be less than 
UNIV_PAGE_SIZE as it is stored in the buffer pool */ #define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t)) @@ -65,7 +73,11 @@ log scan */ ulint recv_scan_print_counter = 0; ibool recv_is_from_backup = FALSE; +#ifdef UNIV_HOTBACKUP ibool recv_is_making_a_backup = FALSE; +#else +# define recv_is_making_a_backup FALSE +#endif /* UNIV_HOTBACKUP */ ulint recv_previous_parsed_rec_type = 999999; ulint recv_previous_parsed_rec_offset = 0; @@ -73,6 +85,13 @@ ulint recv_previous_parsed_rec_is_multi = 0; ulint recv_max_parsed_page_no = 0; +/* This many frames must be left free in the buffer pool when we scan +the log and store the scanned log records in the buffer pool: we will +use these free frames to read in pages when we start applying the +log records to the database. */ + +ulint recv_n_pool_free_frames = 256; + /* The maximum lsn we see for a page during the recovery process. If this is bigger than the lsn we are able to scan up to, that is an indication that the recovery failed and the database may be corrupt. 
*/ @@ -159,7 +178,8 @@ recv_sys_empty_hash(void) fprintf(stderr, "InnoDB: Error: %lu pages with log records were left unprocessed!\n" "InnoDB: Maximum page number with log records on it %lu\n", - recv_sys->n_addrs, recv_max_parsed_page_no); + (ulong) recv_sys->n_addrs, + (ulong) recv_max_parsed_page_no); ut_error; } @@ -297,7 +317,8 @@ recv_copy_group( /*============*/ log_group_t* up_to_date_group, /* in: the most up-to-date log group */ - log_group_t* group, /* in: copy to this log group */ + log_group_t* group, /* in: copy to this log + group */ dulint recovered_lsn) /* in: recovery succeeded up to this lsn */ { @@ -362,7 +383,8 @@ recv_synchronize_groups( /* Read the last recovered log block to the recovery system buffer: the block is always incomplete */ - start_lsn = ut_dulint_align_down(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); + start_lsn = ut_dulint_align_down(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); end_lsn = ut_dulint_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); ut_a(ut_dulint_cmp(start_lsn, end_lsn) != 0); @@ -418,7 +440,7 @@ recv_check_cp_is_consistent( fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); - if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(buf + LOG_CHECKPOINT_CHECKSUM_1)) { return(FALSE); } @@ -426,7 +448,7 @@ recv_check_cp_is_consistent( fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); - if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(buf + LOG_CHECKPOINT_CHECKSUM_2)) { return(FALSE); } @@ -470,8 +492,9 @@ recv_find_max_checkpoint( if (log_debug_writes) { fprintf(stderr, "InnoDB: Checkpoint in group %lu at %lu invalid, %lu\n", - group->id, field, - mach_read_from_4(buf + (ulong) group->id, + (ulong) field, + (ulong) mach_read_from_4(buf + LOG_CHECKPOINT_CHECKSUM_1)); } @@ -491,7 +514,8 @@ recv_find_max_checkpoint( if (log_debug_writes) { fprintf(stderr, "InnoDB: Checkpoint number %lu found 
in group %lu\n", - ut_dulint_get_low(checkpoint_no), group->id); + (ulong) ut_dulint_get_low(checkpoint_no), + (ulong) group->id); } if (ut_dulint_cmp(checkpoint_no, max_no) >= 0) { @@ -533,8 +557,8 @@ recv_read_cp_info_for_backup( byte* hdr, /* in: buffer containing the log group header */ dulint* lsn, /* out: checkpoint lsn */ ulint* offset, /* out: checkpoint offset in the log group */ - ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database - is running with < version 3.23.50 of InnoDB */ + ulint* fsp_limit,/* out: fsp limit of space 0, 1000000000 if the + database is running with < version 3.23.50 of InnoDB */ dulint* cp_no, /* out: checkpoint number */ dulint* first_header_lsn) /* out: lsn of of the start of the first log file */ @@ -679,7 +703,7 @@ recv_scan_log_seg_for_backup( < *scanned_checkpoint_no && *scanned_checkpoint_no - log_block_get_checkpoint_no(log_block) - > 0x80000000) { + > 0x80000000UL) { /* Garbage from a log buffer flush which was made before the most recent database recovery */ @@ -713,7 +737,7 @@ recv_scan_log_seg_for_backup( /*********************************************************************** Tries to parse a single log record body and also applies it to a page if -specified. */ +specified. File ops are parsed, but not applied in this function. 
*/ static byte* recv_parse_or_apply_log_rec_body( @@ -787,8 +811,14 @@ recv_parse_or_apply_log_rec_body( } else if (type == MLOG_INIT_FILE_PAGE) { new_ptr = fsp_parse_init_file_page(ptr, end_ptr, page); - } else if (type <= MLOG_WRITE_STRING) { + } else if (type == MLOG_WRITE_STRING) { new_ptr = mlog_parse_string(ptr, end_ptr, page); + + } else if (type == MLOG_FILE_CREATE + || type == MLOG_FILE_RENAME + || type == MLOG_FILE_DELETE) { + new_ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, + ULINT_UNDEFINED); } else { new_ptr = NULL; @@ -875,9 +905,14 @@ recv_add_to_hash_table( recv_data_t* recv_data; recv_data_t** prev_field; recv_addr_t* recv_addr; - - ut_a(space == 0); /* For debugging; TODO: remove this */ + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) { + /* The tablespace does not exist any more: do not store the + log record */ + + return; + } + len = rec_end - body; recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t)); @@ -900,6 +935,9 @@ recv_add_to_hash_table( HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash, recv_fold(space, page_no), recv_addr); recv_sys->n_addrs++; + + /* fprintf(stderr, "Inserting log rec for space %lu, page %lu\n", + space, page_no); */ } UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv); @@ -1016,6 +1054,8 @@ recv_recover_page( return; } + /* fprintf(stderr, "Recovering space %lu, page %lu\n", space, page_no); */ + recv_addr->state = RECV_BEING_PROCESSED; mutex_exit(&(recv_sys->mutex)); @@ -1037,7 +1077,7 @@ recv_recover_page( success = buf_page_get_known_nowait(RW_X_LATCH, page, BUF_KEEP_OLD, - IB__FILE__, __LINE__, + __FILE__, __LINE__, &mtr); ut_a(success); @@ -1103,8 +1143,9 @@ recv_recover_page( if (log_debug_writes) { fprintf(stderr, "InnoDB: Applying log rec type %lu len %lu to space %lu page no %lu\n", - (ulint)recv->type, recv->len, recv_addr->space, - recv_addr->page_no); + (ulong) recv->type, (ulong) recv->len, + (ulong) recv_addr->space, + (ulong) recv_addr->page_no); } 
recv_parse_or_apply_log_rec_body(recv->type, buf, @@ -1257,9 +1298,9 @@ loop: if (recv_addr->state == RECV_NOT_PROCESSED) { if (!has_printed) { ut_print_timestamp(stderr); - fprintf(stderr, + fputs( " InnoDB: Starting an apply batch of log records to the database...\n" -"InnoDB: Progress in percents: "); +"InnoDB: Progress in percents: ",stderr); has_printed = TRUE; } @@ -1295,8 +1336,7 @@ loop: / hash_get_n_cells(recv_sys->addr_hash)) { fprintf(stderr, "%lu ", - (i * 100) / hash_get_n_cells(recv_sys->addr_hash)); - + (ulong) ((i * 100) / hash_get_n_cells(recv_sys->addr_hash))); } } @@ -1349,126 +1389,127 @@ loop: mutex_exit(&(recv_sys->mutex)); } +/* This page is allocated from the buffer pool and used in the function +below */ +page_t* recv_backup_application_page = NULL; + /*********************************************************************** Applies log records in the hash table to a backup. */ void -recv_apply_log_recs_for_backup( -/*===========================*/ - ulint n_data_files, /* in: number of data files */ - char** data_files, /* in: array containing the paths to the - data files */ - ulint* file_sizes) /* in: sizes of the data files in database - pages */ +recv_apply_log_recs_for_backup(void) +/*================================*/ { recv_addr_t* recv_addr; - os_file_t data_file; - ulint n_pages_total = 0; - ulint nth_file = 0; - ulint nth_page_in_file= 0; + ulint n_hash_cells; byte* page; + ulint actual_size; ibool success; + ulint error; ulint i; recv_sys->apply_log_recs = TRUE; recv_sys->apply_batch_on = TRUE; - page = buf_pool->frame_zero; - - for (i = 0; i < n_data_files; i++) { - n_pages_total += file_sizes[i]; + if (recv_backup_application_page == NULL) { + recv_backup_application_page = buf_frame_alloc(); } - if (recv_max_parsed_page_no >= n_pages_total) { - fprintf(stderr, -"InnoDB: Error: tablespace size %lu pages, but a log record on page %lu!\n" -"InnoDB: Are you sure you have specified all the ibdata files right in\n" -"InnoDB: the 
my.cnf file you gave as the argument to ibbackup --restore?\n", - n_pages_total, recv_max_parsed_page_no); - } + page = recv_backup_application_page; fputs( "InnoDB: Starting an apply batch of log records to the database...\n" "InnoDB: Progress in percents: ", stderr); - for (i = 0; i < n_pages_total; i++) { + n_hash_cells = hash_get_n_cells(recv_sys->addr_hash); - if (i == 0 || nth_page_in_file == file_sizes[nth_file]) { - if (i != 0) { - nth_file++; - nth_page_in_file = 0; - os_file_flush(data_file); - os_file_close(data_file); - } + for (i = 0; i < n_hash_cells; i++) { + /* The address hash table is externally chained */ + recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node; - data_file = os_file_create_simple(data_files[nth_file], - OS_FILE_OPEN, - OS_FILE_READ_WRITE, - &success); - if (!success) { + while (recv_addr != NULL) { + + if (!fil_tablespace_exists_in_mem(recv_addr->space)) { +/* fprintf(stderr, -"InnoDB: Error: cannot open %lu'th data file\n", nth_file); +"InnoDB: Warning: cannot apply log record to tablespace %lu page %lu,\n" +"InnoDB: because tablespace with that id does not exist.\n", + recv_addr->space, recv_addr->page_no); +*/ + recv_addr->state = RECV_PROCESSED; - exit(1); + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + goto skip_this_recv_addr; } - } - - recv_addr = recv_get_fil_addr_struct(0, i); - - if (recv_addr != NULL) { - success = os_file_read(data_file, page, - (nth_page_in_file << UNIV_PAGE_SIZE_SHIFT) - & 0xFFFFFFFF, - nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT), - UNIV_PAGE_SIZE); + + /* We simulate a page read made by the buffer pool, to + make sure the recovery apparatus works ok, for + example, the buf_frame_align() function. We must init + the block corresponding to buf_pool->frame_zero + (== page). 
*/ + + buf_page_init_for_backup_restore(recv_addr->space, + recv_addr->page_no, + buf_block_align(page)); + + /* Extend the tablespace's last file if the page_no + does not fall inside its bounds; we assume the last + file is auto-extending, and ibbackup copied the file + when it still was smaller */ + + success = fil_extend_space_to_desired_size( + &actual_size, + recv_addr->space, + recv_addr->page_no + 1); if (!success) { - fprintf(stderr, -"InnoDB: Error: cannot read page no %lu from %lu'th data file\n", - nth_page_in_file, nth_file); + fprintf(stderr, +"InnoDB: Fatal error: cannot extend tablespace %lu to hold %lu pages\n", + recv_addr->space, recv_addr->page_no); + + exit(1); + } + /* Read the page from the tablespace file using the + fil0fil.c routines */ + + error = fil_io(OS_FILE_READ, TRUE, recv_addr->space, + recv_addr->page_no, 0, UNIV_PAGE_SIZE, + page, NULL); + if (error != DB_SUCCESS) { + fprintf(stderr, +"InnoDB: Fatal error: cannot read from tablespace %lu page number %lu\n", + (ulong) recv_addr->space, (ulong) recv_addr->page_no); + exit(1); } - - /* We simulate a page read made by the buffer pool, - to make sure recovery works ok. 
We must init the - block corresponding to buf_pool->frame_zero - (== page) */ - buf_page_init_for_backup_restore(0, i, - buf_block_align(page)); + /* Apply the log records to this page */ + recv_recover_page(TRUE, FALSE, page, recv_addr->space, + recv_addr->page_no); - recv_recover_page(TRUE, FALSE, page, 0, i); + /* Write the page back to the tablespace file using the + fil0fil.c routines */ buf_flush_init_for_writing(page, mach_read_from_8(page + FIL_PAGE_LSN), - 0, i); - - success = os_file_write(data_files[nth_file], - data_file, page, - (nth_page_in_file << UNIV_PAGE_SIZE_SHIFT) - & 0xFFFFFFFF, - nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT), - UNIV_PAGE_SIZE); - if (!success) { - fprintf(stderr, -"InnoDB: Error: cannot write page no %lu to %lu'th data file\n", - nth_page_in_file, nth_file); + recv_addr->space, recv_addr->page_no); - exit(1); - } + error = fil_io(OS_FILE_WRITE, TRUE, recv_addr->space, + recv_addr->page_no, 0, UNIV_PAGE_SIZE, + page, NULL); +skip_this_recv_addr: + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); } - if ((100 * i) / n_pages_total - != (100 * (i + 1)) / n_pages_total) { - fprintf(stderr, "%lu ", (100 * i) / n_pages_total); + if ((100 * i) / n_hash_cells + != (100 * (i + 1)) / n_hash_cells) { + fprintf(stderr, "%lu ", + (ulong) ((100 * i) / n_hash_cells)); fflush(stderr); } - - nth_page_in_file++; } - - os_file_flush(data_file); - os_file_close(data_file); recv_sys_empty_hash(); } @@ -1608,7 +1649,7 @@ recv_compare_spaces( frame = buf_page_get_gen(space1, page_no, RW_S_LATCH, NULL, BUF_GET_IF_IN_POOL, - IB__FILE__, __LINE__, + __FILE__, __LINE__, &mtr); if (frame) { #ifdef UNIV_SYNC_DEBUG @@ -1623,7 +1664,7 @@ recv_compare_spaces( frame = buf_page_get_gen(space2, page_no, RW_S_LATCH, NULL, BUF_GET_IF_IN_POOL, - IB__FILE__, __LINE__, + __FILE__, __LINE__, &mtr); if (frame) { #ifdef UNIV_SYNC_DEBUG @@ -1673,7 +1714,7 @@ recv_compare_spaces_low( recv_compare_spaces(space1, space2, n_pages); } -#endif +#endif /* UNIV_LOG_REPLICATE 
*/ /*********************************************************************** Tries to parse a single log record and returns its length. */ @@ -1707,7 +1748,7 @@ recv_parse_log_rec( if (*ptr == MLOG_DUMMY_RECORD) { *type = *ptr; - *space = 1000; /* For debugging */ + *space = ULINT_UNDEFINED - 1; /* For debugging */ return(1); } @@ -1719,9 +1760,9 @@ recv_parse_log_rec( return(0); } - /* Check that space id and page_no are sensible */ + /* Check that page_no is sensible */ - if (*space != 0 || *page_no > 0x8FFFFFFF) { + if (*page_no > 0x8FFFFFFFUL) { recv_sys->found_corrupt_log = TRUE; @@ -1808,13 +1849,13 @@ recv_report_corrupt_log( "InnoDB: Log parsing proceeded successfully up to %lu %lu\n" "InnoDB: Previous log record type %lu, is multi %lu\n" "InnoDB: Recv offset %lu, prev %lu\n", - (ulint)type, space, page_no, - ut_dulint_get_high(recv_sys->recovered_lsn), - ut_dulint_get_low(recv_sys->recovered_lsn), - recv_previous_parsed_rec_type, - recv_previous_parsed_rec_is_multi, - (ulint)(ptr - recv_sys->buf), - recv_previous_parsed_rec_offset); + (ulong) type, (ulong) space, (ulong) page_no, + (ulong) ut_dulint_get_high(recv_sys->recovered_lsn), + (ulong) ut_dulint_get_low(recv_sys->recovered_lsn), + (ulong) recv_previous_parsed_rec_type, + (ulong) recv_previous_parsed_rec_is_multi, + (ulong) (ptr - recv_sys->buf), + (ulong) recv_previous_parsed_rec_offset); if ((ulint)(ptr - recv_sys->buf + 100) > recv_previous_parsed_rec_offset @@ -1888,12 +1929,16 @@ loop: single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG; if (single_rec || *ptr == MLOG_DUMMY_RECORD) { - /* The mtr only modified a single page */ + /* The mtr only modified a single page, or this is a file op */ old_lsn = recv_sys->recovered_lsn; + /* Try to parse a log record, fetching its type, space id, + page no, and a pointer to the body of the log record */ + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, &page_no, &body); + if (len == 0 || recv_sys->found_corrupt_log) { if (recv_sys->found_corrupt_log) { 
@@ -1925,12 +1970,36 @@ loop: if (log_debug_writes) { fprintf(stderr, "InnoDB: Parsed a single log rec type %lu len %lu space %lu page no %lu\n", - (ulint)type, len, space, page_no); + (ulong) type, (ulong) len, (ulong) space, + (ulong) page_no); } if (type == MLOG_DUMMY_RECORD) { /* Do nothing */ + } else if (store_to_hash && (type == MLOG_FILE_CREATE + || type == MLOG_FILE_RENAME + || type == MLOG_FILE_DELETE)) { +#ifdef UNIV_HOTBACKUP + if (recv_replay_file_ops) { + + /* In ibbackup --apply-log, replay an .ibd file + operation, if possible; note that + fil_path_to_mysql_datadir is set in ibbackup to + point to the datadir we should use there */ + + if (NULL == fil_op_log_parse_or_replay(body, + end_ptr, type, TRUE, space)) { + fprintf(stderr, +"InnoDB: Error: file op log record of type %lu space %lu not complete in\n" +"InnoDB: the replay phase. Path %s\n", (ulint)type, space, (char*)(body + 2)); + + ut_a(0); + } + } +#endif + /* In normal mysqld crash recovery we do not try to + replay file operations */ } else if (store_to_hash) { recv_add_to_hash_table(type, space, page_no, body, ptr + len, old_lsn, @@ -1941,11 +2010,13 @@ loop: becomes identical with the original page */ #ifdef UNIV_LOG_DEBUG recv_check_incomplete_log_recs(ptr, len); -#endif -/* recv_update_replicate(type, space, page_no, body, +#endif/* UNIV_LOG_DEBUG */ +#ifdef UNIV_LOG_REPLICATE + recv_update_replicate(type, space, page_no, body, ptr + len); recv_compare_replicate(space, page_no); -*/ +#endif /* UNIV_LOG_REPLICATE */ + } } else { /* Check that all the records associated with the single mtr @@ -1978,17 +2049,18 @@ loop: according to the log record */ #ifdef UNIV_LOG_DEBUG recv_check_incomplete_log_recs(ptr, len); -#endif -/* +#endif /* UNIV_LOG_DEBUG */ +#ifdef UNIV_LOG_REPLICATE recv_update_replicate(type, space, page_no, body, ptr + len); -*/ +#endif /* UNIV_LOG_REPLICATE */ } if (log_debug_writes) { fprintf(stderr, "InnoDB: Parsed a multi log rec type %lu len %lu space %lu page no 
%lu\n", - (ulint)type, len, space, page_no); + (ulong) type, (ulong) len, (ulong) space, + (ulong) page_no); } total_len += len; @@ -2047,12 +2119,13 @@ loop: recv_add_to_hash_table(type, space, page_no, body, ptr + len, old_lsn, new_recovered_lsn); +#ifdef UNIV_LOG_REPLICATE } else { /* In debug checking, check that the replicate page has become identical with the original page */ - -/* recv_compare_replicate(space, page_no); */ + recv_compare_replicate(space, page_no); +#endif /* UNIV_LOG_REPLICATE */ } ptr += len; @@ -2218,10 +2291,11 @@ recv_scan_log_recs( fprintf(stderr, "InnoDB: Log block no %lu at lsn %lu %lu has\n" "InnoDB: ok header, but checksum field contains %lu, should be %lu\n", - no, ut_dulint_get_high(scanned_lsn), - ut_dulint_get_low(scanned_lsn), - log_block_get_checksum(log_block), - log_block_calc_checksum(log_block)); + (ulong) no, + (ulong) ut_dulint_get_high(scanned_lsn), + (ulong) ut_dulint_get_low(scanned_lsn), + (ulong) log_block_get_checksum(log_block), + (ulong) log_block_calc_checksum(log_block)); } /* Garbage or an incompletely written log block */ @@ -2254,7 +2328,7 @@ recv_scan_log_recs( < recv_sys->scanned_checkpoint_no) && (recv_sys->scanned_checkpoint_no - log_block_get_checkpoint_no(log_block) - > 0x80000000)) { + > 0x80000000UL)) { /* Garbage from a log buffer flush which was made before the most recent database recovery */ @@ -2287,7 +2361,8 @@ recv_scan_log_recs( if (ut_dulint_cmp(scanned_lsn, recv_sys->scanned_lsn) > 0) { /* We were able to find more log data: add it to the - parsing buffer if parse_start_lsn is already non-zero */ + parsing buffer if parse_start_lsn is already + non-zero */ if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE >= RECV_PARSING_BUF_SIZE) { @@ -2325,8 +2400,8 @@ recv_scan_log_recs( fprintf(stderr, "InnoDB: Doing recovery: scanned up to log sequence number %lu %lu\n", - ut_dulint_get_high(*group_scanned_lsn), - ut_dulint_get_low(*group_scanned_lsn)); + (ulong) ut_dulint_get_high(*group_scanned_lsn), + 
(ulong) ut_dulint_get_low(*group_scanned_lsn)); } } @@ -2385,8 +2460,8 @@ recv_group_scan_log_recs( group, start_lsn, end_lsn); finished = recv_scan_log_recs(TRUE, - buf_pool_get_curr_size() - - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + (buf_pool->n_frames + - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, TRUE, log_sys->buf, RECV_SCAN_SIZE, start_lsn, contiguous_lsn, group_scanned_lsn); @@ -2396,9 +2471,9 @@ recv_group_scan_log_recs( if (log_debug_writes) { fprintf(stderr, "InnoDB: Scanned group %lu up to log sequence number %lu %lu\n", - group->id, - ut_dulint_get_high(*group_scanned_lsn), - ut_dulint_get_low(*group_scanned_lsn)); + (ulong) group->id, + (ulong) ut_dulint_get_high(*group_scanned_lsn), + (ulong) ut_dulint_get_low(*group_scanned_lsn)); } } @@ -2436,7 +2511,6 @@ recv_recovery_from_checkpoint_start( || (ut_dulint_cmp(limit_lsn, ut_dulint_max) == 0)); if (type == LOG_CHECKPOINT) { - recv_sys_create(); recv_sys_init(FALSE, buf_pool_get_curr_size()); } @@ -2450,8 +2524,6 @@ recv_recovery_from_checkpoint_start( return(DB_SUCCESS); } - sync_order_checks_on = TRUE; - recv_recovery_on = TRUE; recv_sys->limit_lsn = limit_lsn; @@ -2495,15 +2567,16 @@ recv_recovery_from_checkpoint_start( /* Wipe over the label now */ - ut_memcpy(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, - (char*)" ", 4); + memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + ' ', 4); /* Write to the log file to wipe over the label */ fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0, 0, OS_FILE_LOG_BLOCK_SIZE, log_hdr_buf, max_cp_group); } - + +#ifdef UNIV_LOG_ARCHIVE group = UT_LIST_GET_FIRST(log_sys->log_groups); while (group) { @@ -2513,6 +2586,7 @@ recv_recovery_from_checkpoint_start( group = UT_LIST_GET_NEXT(log_groups, group); } +#endif /* UNIV_LOG_ARCHIVE */ if (type == LOG_CHECKPOINT) { /* Start reading the log groups from the checkpoint lsn up. 
The @@ -2524,25 +2598,72 @@ recv_recovery_from_checkpoint_start( recv_sys->scanned_checkpoint_no = 0; recv_sys->recovered_lsn = checkpoint_lsn; - /* NOTE: we always do recovery at startup, but only if + srv_start_lsn = checkpoint_lsn; + + /* NOTE: we always do a 'recovery' at startup, but only if there is something wrong we will print a message to the user about recovery: */ if (ut_dulint_cmp(checkpoint_lsn, max_flushed_lsn) != 0 || ut_dulint_cmp(checkpoint_lsn, min_flushed_lsn) != 0) { + if (ut_dulint_cmp(checkpoint_lsn, max_flushed_lsn) + < 0) { + fprintf(stderr, +"InnoDB: ##########################################################\n" +"InnoDB: WARNING!\n" +"InnoDB: The log sequence number in ibdata files is higher\n" +"InnoDB: than the log sequence number in the ib_logfiles! Are you sure\n" +"InnoDB: you are using the right ib_logfiles to start up the database?\n" +"InnoDB: Log sequence number in ib_logfiles is %lu %lu, log\n" +"InnoDB: sequence numbers stamped to ibdata file headers are between\n" +"InnoDB: %lu %lu and %lu %lu.\n" +"InnoDB: ##########################################################\n", + (ulong) ut_dulint_get_high(checkpoint_lsn), + (ulong) ut_dulint_get_low(checkpoint_lsn), + (ulong) ut_dulint_get_high(min_flushed_lsn), + (ulong) ut_dulint_get_low(min_flushed_lsn), + (ulong) ut_dulint_get_high(max_flushed_lsn), + (ulong) ut_dulint_get_low(max_flushed_lsn)); + } + recv_needed_recovery = TRUE; ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Database was not shut down normally.\n" - "InnoDB: Starting recovery from log files...\n"); +" InnoDB: Database was not shut down normally!\n" +"InnoDB: Starting crash recovery.\n"); + + fprintf(stderr, +"InnoDB: Reading tablespace information from the .ibd files...\n"); + + fil_load_single_table_tablespaces(); + + /* If we are using the doublewrite method, we will + check if there are half-written pages in data files, + and restore them from the doublewrite buffer if + possible */ + + if 
(srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { + + fprintf(stderr, +"InnoDB: Restoring possible half-written data pages from the doublewrite\n" +"InnoDB: buffer...\n"); + trx_sys_doublewrite_init_or_restore_pages( + TRUE); + } + + ut_print_timestamp(stderr); + fprintf(stderr, - "InnoDB: Starting log scan based on checkpoint at\n" - "InnoDB: log sequence number %lu %lu\n", - ut_dulint_get_high(checkpoint_lsn), - ut_dulint_get_low(checkpoint_lsn)); +" InnoDB: Starting log scan based on checkpoint at\n" +"InnoDB: log sequence number %lu %lu.\n", + (ulong) ut_dulint_get_high(checkpoint_lsn), + (ulong) ut_dulint_get_low(checkpoint_lsn)); + } else { + /* Init the doublewrite buffer memory structure */ + trx_sys_doublewrite_init_or_restore_pages(FALSE); } } @@ -2623,10 +2744,10 @@ recv_recovery_from_checkpoint_start( " InnoDB: ERROR: We were only able to scan the log up to\n" "InnoDB: %lu %lu, but a checkpoint was at %lu %lu.\n" "InnoDB: It is possible that the database is now corrupt!\n", - ut_dulint_get_high(group_scanned_lsn), - ut_dulint_get_low(group_scanned_lsn), - ut_dulint_get_high(checkpoint_lsn), - ut_dulint_get_low(checkpoint_lsn)); + (ulong) ut_dulint_get_high(group_scanned_lsn), + (ulong) ut_dulint_get_low(group_scanned_lsn), + (ulong) ut_dulint_get_high(checkpoint_lsn), + (ulong) ut_dulint_get_low(checkpoint_lsn)); } if (ut_dulint_cmp(group_scanned_lsn, recv_max_page_lsn) < 0) { @@ -2635,10 +2756,10 @@ recv_recovery_from_checkpoint_start( " InnoDB: ERROR: We were only able to scan the log up to %lu %lu\n" "InnoDB: but a database page a had an lsn %lu %lu. 
It is possible that the\n" "InnoDB: database is now corrupt!\n", - ut_dulint_get_high(group_scanned_lsn), - ut_dulint_get_low(group_scanned_lsn), - ut_dulint_get_high(recv_max_page_lsn), - ut_dulint_get_low(recv_max_page_lsn)); + (ulong) ut_dulint_get_high(group_scanned_lsn), + (ulong) ut_dulint_get_low(group_scanned_lsn), + (ulong) ut_dulint_get_high(recv_max_page_lsn), + (ulong) ut_dulint_get_low(recv_max_page_lsn)); } if (ut_dulint_cmp(recv_sys->recovered_lsn, checkpoint_lsn) < 0) { @@ -2661,9 +2782,26 @@ recv_recovery_from_checkpoint_start( log_sys->next_checkpoint_lsn = checkpoint_lsn; log_sys->next_checkpoint_no = ut_dulint_add(checkpoint_no, 1); +#ifdef UNIV_LOG_ARCHIVE log_sys->archived_lsn = archived_lsn; +#endif /* UNIV_LOG_ARCHIVE */ recv_synchronize_groups(up_to_date_group); + + if (!recv_needed_recovery) { + if (ut_dulint_cmp(checkpoint_lsn, recv_sys->recovered_lsn) + != 0) { + fprintf(stderr, +"InnoDB: Warning: we did not need to do crash recovery, but log scan\n" +"InnoDB: progressed past the checkpoint lsn %lu %lu up to lsn %lu %lu\n", + (ulong) ut_dulint_get_high(checkpoint_lsn), + (ulong) ut_dulint_get_low(checkpoint_lsn), + (ulong) ut_dulint_get_high(recv_sys->recovered_lsn), + (ulong) ut_dulint_get_low(recv_sys->recovered_lsn)); + } + } else { + srv_start_lsn = recv_sys->recovered_lsn; + } log_sys->lsn = recv_sys->recovered_lsn; @@ -2679,10 +2817,12 @@ recv_recovery_from_checkpoint_start( log_sys->next_checkpoint_no = ut_dulint_add(checkpoint_no, 1); +#ifdef UNIV_LOG_ARCHIVE if (ut_dulint_cmp(archived_lsn, ut_dulint_max) == 0) { log_sys->archiving_state = LOG_ARCH_OFF; } +#endif /* UNIV_LOG_ARCHIVE */ mutex_enter(&(recv_sys->mutex)); @@ -2692,8 +2832,6 @@ recv_recovery_from_checkpoint_start( mutex_exit(&(log_sys->mutex)); - sync_order_checks_on = FALSE; - recv_lsn_checks_on = TRUE; /* The database is now ready to start almost normal processing of user @@ -2761,7 +2899,9 @@ recv_reset_logs( dulint lsn, /* in: reset to this lsn rounded up to be 
divisible by OS_FILE_LOG_BLOCK_SIZE, after which we add LOG_BLOCK_HDR_SIZE */ +#ifdef UNIV_LOG_ARCHIVE ulint arch_log_no, /* in: next archived log file number */ +#endif /* UNIV_LOG_ARCHIVE */ ibool new_logs_created)/* in: TRUE if resetting logs is done at the log creation; FALSE if it is done after archive recovery */ @@ -2778,9 +2918,10 @@ recv_reset_logs( while (group) { group->lsn = log_sys->lsn; group->lsn_offset = LOG_FILE_HDR_SIZE; - +#ifdef UNIV_LOG_ARCHIVE group->archived_file_no = arch_log_no; group->archived_offset = 0; +#endif /* UNIV_LOG_ARCHIVE */ if (!new_logs_created) { recv_truncate_group(group, group->lsn, group->lsn, @@ -2797,7 +2938,9 @@ recv_reset_logs( log_sys->next_checkpoint_no = ut_dulint_zero; log_sys->last_checkpoint_lsn = ut_dulint_zero; +#ifdef UNIV_LOG_ARCHIVE log_sys->archived_lsn = log_sys->lsn; +#endif /* UNIV_LOG_ARCHIVE */ log_block_init(log_sys->buf, log_sys->lsn); log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); @@ -2815,17 +2958,18 @@ recv_reset_logs( mutex_enter(&(log_sys->mutex)); } +#ifdef UNIV_HOTBACKUP /********************************************************** Creates new log files after a backup has been restored. 
*/ void recv_reset_log_files_for_backup( /*============================*/ - char* log_dir, /* in: log file directory path */ - ulint n_log_files, /* in: number of log files */ - ulint log_file_size, /* in: log file size */ - dulint lsn) /* in: new start lsn, must be divisible by - OS_FILE_LOG_BLOCK_SIZE */ + const char* log_dir, /* in: log file directory path */ + ulint n_log_files, /* in: number of log files */ + ulint log_file_size, /* in: log file size */ + dulint lsn) /* in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ { os_file_t log_file; ibool success; @@ -2833,8 +2977,8 @@ recv_reset_log_files_for_backup( ulint i; ulint log_dir_len; char* name; - static - char logfilename[] = "ib_logfile"; + static const + char logfilename[] = "ib_logfile"; log_dir_len = strlen(log_dir); /* reserve space for log_dir, "ib_logfile" and a number */ @@ -2843,10 +2987,12 @@ recv_reset_log_files_for_backup( memcpy(name + log_dir_len, logfilename, sizeof logfilename); buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + memset(buf, LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE, '\0'); + for (i = 0; i < n_log_files; i++) { - sprintf(name + log_dir_len + sizeof logfilename, "%lu", i); + sprintf(name + log_dir_len + sizeof logfilename, "%lu", (ulong) i); log_file = os_file_create_simple(name, OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); @@ -2859,17 +3005,17 @@ recv_reset_log_files_for_backup( fprintf(stderr, "Setting log file size to %lu %lu\n", - ut_get_high32(log_file_size), - log_file_size & 0xFFFFFFFF); + (ulong) ut_get_high32(log_file_size), + (ulong) log_file_size & 0xFFFFFFFFUL); success = os_file_set_size(name, log_file, - log_file_size & 0xFFFFFFFF, + log_file_size & 0xFFFFFFFFUL, ut_get_high32(log_file_size)); if (!success) { fprintf(stderr, -"InnoDB: Cannot set %s size to %lu %lu\n", name, ut_get_high32(log_file_size), - log_file_size & 0xFFFFFFFF); +"InnoDB: Cannot set %s size to %lu %lu\n", name, (ulong) ut_get_high32(log_file_size), + (ulong) 
(log_file_size & 0xFFFFFFFFUL)); exit(1); } @@ -2902,7 +3048,9 @@ recv_reset_log_files_for_backup( mem_free(name); ut_free(buf); } +#endif /* UNIV_HOTBACKUP */ +#ifdef UNIV_LOG_ARCHIVE /********************************************************** Reads from the archive of a log group and performs recovery. */ static @@ -2927,6 +3075,8 @@ log_group_recover_from_archive_file( int input_char; char name[10000]; + ut_a(0); + try_open_again: buf = log_sys->buf; @@ -2934,13 +3084,10 @@ try_open_again: log_archived_file_name_gen(name, group->id, group->archived_file_no); - fil_reserve_right_to_open(); - file_handle = os_file_create(name, OS_FILE_OPEN, OS_FILE_LOG, OS_FILE_AIO, &ret); if (ret == FALSE) { - fil_release_right_to_open(); ask_again: fprintf(stderr, "InnoDB: Do you want to copy additional archived log files\n" @@ -2981,12 +3128,10 @@ ask_again: ut_a(ret); - fil_release_right_to_open(); - /* Add the archive file as a node to the space */ fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE, - group->archive_space_id); + group->archive_space_id, FALSE); ut_a(RECV_SCAN_SIZE >= LOG_FILE_HDR_SIZE); /* Read the archive file header */ @@ -3052,9 +3197,9 @@ ask_again: if (log_debug_writes) { fprintf(stderr, "InnoDB: Archive read starting at lsn %lu %lu, len %lu from file %s\n", - ut_dulint_get_high(start_lsn), - ut_dulint_get_low(start_lsn), - len, name); + (ulong) ut_dulint_get_high(start_lsn), + (ulong) ut_dulint_get_low(start_lsn), + (ulong) len, name); } fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, @@ -3062,8 +3207,8 @@ ask_again: read_offset % UNIV_PAGE_SIZE, len, buf, NULL); ret = recv_scan_log_recs(TRUE, - buf_pool_get_curr_size() - - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + (buf_pool->n_frames - + recv_n_pool_free_frames) * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn, &dummy_lsn, &scanned_lsn); @@ -3109,10 +3254,10 @@ recv_recovery_from_archive_start( ibool ret; ulint err; + ut_a(0); + recv_sys_create(); recv_sys_init(FALSE, buf_pool_get_curr_size()); - - 
sync_order_checks_on = TRUE; recv_recovery_on = TRUE; recv_recovery_from_backup_on = TRUE; @@ -3135,7 +3280,7 @@ recv_recovery_from_archive_start( if (!group) { fprintf(stderr, "InnoDB: There is no log group defined with id %lu!\n", - group_id); + (ulong) group_id); return(DB_ERROR); } @@ -3199,8 +3344,6 @@ recv_recovery_from_archive_start( mutex_exit(&(log_sys->mutex)); - sync_order_checks_on = FALSE; - return(DB_SUCCESS); } @@ -3215,3 +3358,4 @@ recv_recovery_from_archive_finish(void) recv_recovery_from_backup_on = FALSE; } +#endif /* UNIV_LOG_ARCHIVE */ diff --git a/innobase/mach/mach0data.c b/innobase/mach/mach0data.c index 336ce106a75..ff7265b34f4 100644 --- a/innobase/mach/mach0data.c +++ b/innobase/mach/mach0data.c @@ -36,37 +36,37 @@ mach_parse_compressed( flag = mach_read_from_1(ptr); - if (flag < 0x80) { + if (flag < 0x80UL) { *val = flag; return(ptr + 1); - } else if (flag < 0xC0) { + } else if (flag < 0xC0UL) { if (end_ptr < ptr + 2) { return(NULL); } - *val = mach_read_from_2(ptr) & 0x7FFF; + *val = mach_read_from_2(ptr) & 0x7FFFUL; return(ptr + 2); - } else if (flag < 0xE0) { + } else if (flag < 0xE0UL) { if (end_ptr < ptr + 3) { return(NULL); } - *val = mach_read_from_3(ptr) & 0x3FFFFF; + *val = mach_read_from_3(ptr) & 0x3FFFFFUL; return(ptr + 3); - } else if (flag < 0xF0) { + } else if (flag < 0xF0UL) { if (end_ptr < ptr + 4) { return(NULL); } - *val = mach_read_from_4(ptr) & 0x1FFFFFFF; + *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL; return(ptr + 4); } else { - ut_ad(flag == 0xF0); + ut_ad(flag == 0xF0UL); if (end_ptr < ptr + 5) { return(NULL); diff --git a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c index 5ce7bc9f47d..ea8c296f8cf 100644 --- a/innobase/mem/mem0dbg.c +++ b/innobase/mem/mem0dbg.c @@ -34,7 +34,7 @@ struct mem_hash_node_struct { UT_LIST_NODE_T(mem_hash_node_t) list; /* hash list node */ mem_heap_t* heap; /* memory heap */ - char* file_name;/* file where heap was created*/ + const char* file_name;/* file where heap was created*/ 
ulint line; /* file line of creation */ ulint nth_heap;/* this is the nth heap created */ UT_LIST_NODE_T(mem_hash_node_t) @@ -267,7 +267,7 @@ void mem_hash_insert( /*============*/ mem_heap_t* heap, /* in: the created heap */ - char* file_name, /* in: file name of creation */ + const char* file_name, /* in: file name of creation */ ulint line) /* in: line where created */ { mem_hash_node_t* new_node; @@ -310,7 +310,7 @@ void mem_hash_remove( /*============*/ mem_heap_t* heap, /* in: the heap to be freed */ - char* file_name, /* in: file name of freeing */ + const char* file_name, /* in: file name of freeing */ ulint line) /* in: line where freed */ { mem_hash_node_t* node; @@ -339,7 +339,7 @@ mem_hash_remove( if (node == NULL) { fprintf(stderr, "Memory heap or buffer freed in %s line %lu did not exist.\n", - file_name, line); + file_name, (ulong) line); ut_error; } @@ -356,15 +356,13 @@ mem_hash_remove( "Inconsistency in memory heap or buffer n:o %lu created\n" "in %s line %lu and tried to free in %s line %lu.\n" "Hex dump of 400 bytes around memory heap first block start:\n", - node->nth_heap, node->file_name, node->line, - file_name, line); + node->nth_heap, node->file_name, (ulong) node->line, + file_name, (ulong) line); ut_print_buf(stderr, (byte*)node->heap - 200, 400); - fputs("\nDump of the mem heap:\n", stderr); - mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, - &size, NULL, NULL); - ut_error; + &size, NULL, NULL); + ut_error; } /* Free the memory occupied by the node struct */ @@ -446,6 +444,9 @@ mem_heap_validate_or_print( if ((block->type == MEM_HEAP_BUFFER) && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) { + fprintf(stderr, +"InnoDB: Error: mem block %lx length %lu > UNIV_PAGE_SIZE\n", (ulong) block, + (ulong) mem_block_get_len(block)); /* error */ return; @@ -485,6 +486,12 @@ mem_heap_validate_or_print( mem_field_trailer_get_check(user_field)) { /* error */ + fprintf(stderr, +"InnoDB: Error: block %lx mem field %lx len %lu\n" +"InnoDB: 
header check field is %lx but trailer %lx\n", (ulint)block, + (ulint)field, len, check_field, + mem_field_trailer_get_check(user_field)); + return; } @@ -504,6 +511,11 @@ mem_heap_validate_or_print( if (field != (byte*)block + mem_block_get_free(block)) { /* error */ + fprintf(stderr, +"InnoDB: Error: block %lx end of mem fields %lx\n" +"InnoDB: but block free at %lx\n", (ulint)block, (ulint)field, + (ulint)((byte*)block + mem_block_get_free(block))); + return; } @@ -546,7 +558,8 @@ mem_heap_print( &us_size, &phys_size, &n_blocks); fprintf(stderr, "\nheap type: %lu; size: user size %lu; physical size %lu; blocks %lu.\n", - heap->type, us_size, phys_size, n_blocks); + (ulong) heap->type, (ulong) us_size, + (ulong) phys_size, (ulong) n_blocks); ut_a(!error); } @@ -582,6 +595,10 @@ mem_heap_validate( mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size, &phys_size, &n_blocks); + if (error) { + mem_heap_print(heap); + } + ut_a(!error); return(TRUE); @@ -737,8 +754,8 @@ mem_analyze_corruption( if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) { fprintf(stderr, "Mem block at - %lu, file %s, line %lu\n", - dist, p + sizeof(ulint), - *(ulint*)(p + 8 + sizeof(ulint))); + (ulong) dist, (p + sizeof(ulint)), + (ulong) (*(ulint*)(p + 8 + sizeof(ulint)))); break; } @@ -746,8 +763,8 @@ mem_analyze_corruption( if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) { fprintf(stderr, "Freed mem block at - %lu, file %s, line %lu\n", - dist, p + sizeof(ulint), - *(ulint*)(p + 8 + sizeof(ulint))); + (ulong) dist, (p + sizeof(ulint)), + (ulong) (*(ulint*)(p + 8 + sizeof(ulint)))); break; } @@ -774,8 +791,8 @@ mem_analyze_corruption( if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) { fprintf(stderr, "Mem block at + %lu, file %s, line %lu\n", - dist, p + sizeof(ulint), - *(ulint*)(p + 8 + sizeof(ulint))); + (ulong) dist, (p + sizeof(ulint)), + (ulong) (*(ulint*)(p + 8 + sizeof(ulint)))); break; } @@ -783,8 +800,8 @@ mem_analyze_corruption( if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) { fprintf(stderr, "Freed 
mem block at + %lu, file %s, line %lu\n", - dist, p + sizeof(ulint), - *(ulint*)(p + 8 + sizeof(ulint))); + (ulong) dist, (p + sizeof(ulint)), + (ulong) (*(ulint*)(p + 8 + sizeof(ulint)))); break; } diff --git a/innobase/mem/mem0mem.c b/innobase/mem/mem0mem.c index e1b9a762381..c090b25a632 100644 --- a/innobase/mem/mem0mem.c +++ b/innobase/mem/mem0mem.c @@ -92,11 +92,11 @@ with mem_free. */ void* mem_alloc_func_noninline( /*=====================*/ - /* out, own: free storage, NULL if did not - succeed */ - ulint n, /* in: desired number of bytes */ - char* file_name, /* in: file name where created */ - ulint line /* in: line where created */ + /* out, own: free storage, + NULL if did not succeed */ + ulint n, /* in: desired number of bytes */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ ) { return(mem_alloc_func(n, file_name, line)); @@ -108,18 +108,18 @@ Creates a memory heap block where data can be allocated. */ mem_block_t* mem_heap_create_block( /*==================*/ - /* out, own: memory heap block, NULL if did not - succeed */ - mem_heap_t* heap,/* in: memory heap or NULL if first block should - be created */ - ulint n, /* in: number of bytes needed for user data, or - if init_block is not NULL, its size in bytes */ - void* init_block, /* in: init block in fast create, type must be - MEM_HEAP_DYNAMIC */ - ulint type, /* in: type of heap: MEM_HEAP_DYNAMIC, or - MEM_HEAP_BUFFER possibly ORed to MEM_HEAP_BTR_SEARCH */ - char* file_name,/* in: file name where created */ - ulint line) /* in: line where created */ + /* out, own: memory heap block, + NULL if did not succeed */ + mem_heap_t* heap, /* in: memory heap or NULL if first block + should be created */ + ulint n, /* in: number of bytes needed for user data, or + if init_block is not NULL, its size in bytes */ + void* init_block, /* in: init block in fast create, + type must be MEM_HEAP_DYNAMIC */ + ulint type, /* in: type of heap: MEM_HEAP_DYNAMIC or + 
MEM_HEAP_BUFFER */ + const char* file_name,/* in: file name where created */ + ulint line) /* in: line where created */ { mem_block_t* block; ulint len; diff --git a/innobase/mem/mem0pool.c b/innobase/mem/mem0pool.c index cd75728c937..023369e8ec5 100644 --- a/innobase/mem/mem0pool.c +++ b/innobase/mem/mem0pool.c @@ -277,7 +277,8 @@ mem_pool_fill_free_list( fprintf(stderr, " InnoDB: Error: mem pool free list %lu length is %lu\n" "InnoDB: though the list is empty!\n", - i + 1, UT_LIST_GET_LEN(pool->free_list[i + 1])); + (ulong) i + 1, + (ulong) UT_LIST_GET_LEN(pool->free_list[i + 1])); } ret = mem_pool_fill_free_list(i + 1, pool); @@ -358,7 +359,7 @@ mem_area_alloc( fprintf(stderr, "InnoDB: Error: Removing element from mem pool free list %lu though the\n" "InnoDB: element is not marked free!\n", - n); + (ulong) n); mem_analyze_corruption((byte*)area); @@ -378,7 +379,7 @@ mem_area_alloc( fprintf(stderr, "InnoDB: Error: Removing element from mem pool free list %lu\n" "InnoDB: though the list length is 0!\n", - n); + (ulong) n); mem_analyze_corruption((byte*)area); ut_error; @@ -498,7 +499,7 @@ mem_area_free( fprintf(stderr, "InnoDB: Error: Memory area size %lu, next area size %lu not a power of 2!\n" "InnoDB: Possibly a memory overrun of the buffer being freed here.\n", - size, next_size); + (ulong) size, (ulong) next_size); mem_analyze_corruption((byte*)area); ut_error; @@ -597,8 +598,8 @@ mem_pool_validate( } } - ut_a(free + pool->reserved == pool->size - - (pool->size % MEM_AREA_MIN_SIZE)); + ut_a(free + pool->reserved == pool->size); + mutex_exit(&(pool->mutex)); return(TRUE); @@ -626,13 +627,13 @@ mem_pool_print_info( fprintf(outfile, "Free list length %lu for blocks of size %lu\n", - UT_LIST_GET_LEN(pool->free_list[i]), - ut_2_exp(i)); + (ulong) UT_LIST_GET_LEN(pool->free_list[i]), + (ulong) ut_2_exp(i)); } } - fprintf(outfile, "Pool size %lu, reserved %lu.\n", pool->size, - pool->reserved); + fprintf(outfile, "Pool size %lu, reserved %lu.\n", (ulong) pool->size, 
+ (ulong) pool->reserved); mutex_exit(&(pool->mutex)); } diff --git a/innobase/mtr/mtr0mtr.c b/innobase/mtr/mtr0mtr.c index 0106238b952..6e918806eb1 100644 --- a/innobase/mtr/mtr0mtr.c +++ b/innobase/mtr/mtr0mtr.c @@ -329,6 +329,6 @@ mtr_print( { fprintf(stderr, "Mini-transaction handle: memo size %lu bytes log size %lu bytes\n", - dyn_array_get_data_size(&(mtr->memo)), - dyn_array_get_data_size(&(mtr->log))); + (ulong) dyn_array_get_data_size(&(mtr->memo)), + (ulong) dyn_array_get_data_size(&(mtr->log))); } diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 8cb2b171328..d5ca8f927c6 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -11,6 +11,7 @@ Created 10/21/1995 Heikki Tuuri #include "os0thread.h" #include "ut0mem.h" #include "srv0srv.h" +#include "srv0start.h" #include "fil0fil.h" #include "buf0buf.h" @@ -33,7 +34,7 @@ ulint os_innodb_umask = 0; #endif /* If the following is set to TRUE, we do not call os_file_flush in every -os_file_write. We can set this TRUE if the doublewrite buffer is used. */ +os_file_write. We can set this TRUE when the doublewrite buffer is used. */ ibool os_do_not_call_flush_at_each_write = FALSE; /* We use these mutexes to protect lseek + file i/o operation, if the @@ -69,7 +70,7 @@ struct os_aio_slot_struct{ bytes */ ulint offset_high; /* 32 high bits of file offset */ os_file_t file; /* file where to read or write */ - char* name; /* file name or path */ + const char* name; /* file name or path */ ibool io_already_done;/* used only in simulated aio: TRUE if the physical i/o already made and only the slot message @@ -154,7 +155,6 @@ os_mutex_t os_file_count_mutex; ulint os_file_n_pending_preads = 0; ulint os_file_n_pending_pwrites = 0; - /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -198,9 +198,12 @@ overwrite the error number). 
If the number is not known to this program, the OS error number + 100 is returned. */ ulint -os_file_get_last_error(void) -/*========================*/ - /* out: error number, or OS error number + 100 */ +os_file_get_last_error( +/*===================*/ + /* out: error number, or OS error + number + 100 */ + ibool report_all_errors) /* in: TRUE if we want an error message + printed of all errors */ { ulint err; @@ -208,25 +211,29 @@ os_file_get_last_error(void) err = (ulint) GetLastError(); - if (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS) { + if (report_all_errors + || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) { + ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %lu in a file operation.\n" - "InnoDB: See http://www.innodb.com/ibman.php for installation help.\n", - err); + " InnoDB: Operating system error number %lu in a file operation.\n", (ulong) err); if (err == ERROR_PATH_NOT_FOUND) { - fprintf(stderr, - "InnoDB: The error means the system cannot find the path specified.\n" - "InnoDB: In installation you must create directories yourself, InnoDB\n" - "InnoDB: does not create them.\n"); + fprintf(stderr, + "InnoDB: The error means the system cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB, remember that you must create\n" + "InnoDB: directories yourself, InnoDB does not create them.\n"); + } } else if (err == ERROR_ACCESS_DENIED) { - fprintf(stderr, + fprintf(stderr, "InnoDB: The error means mysqld does not have the access rights to\n" "InnoDB: the directory. 
It may also be you have created a subdirectory\n" "InnoDB: of the same name as a data file.\n"); } else { - fprintf(stderr, + fprintf(stderr, "InnoDB: See section 13.2 at http://www.innodb.com/ibman.php\n" "InnoDB: about operating system error numbers.\n"); } @@ -246,30 +253,33 @@ os_file_get_last_error(void) #else err = (ulint) errno; - if (err != ENOSPC && err != EEXIST) { - ut_print_timestamp(stderr); + if (report_all_errors + || (err != ENOSPC && err != EEXIST)) { + ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %lu in a file operation.\n" - "InnoDB: See http://www.innodb.com/ibman.php for installation help.\n", - err); + " InnoDB: Operating system error number %lu in a file operation.\n", (ulong) err); if (err == ENOENT) { - fprintf(stderr, - "InnoDB: The error means the system cannot find the path specified.\n" - "InnoDB: In installation you must create directories yourself, InnoDB\n" - "InnoDB: does not create them.\n"); + fprintf(stderr, + "InnoDB: The error means the system cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB, remember that you must create\n" + "InnoDB: directories yourself, InnoDB does not create them.\n"); + } } else if (err == EACCES) { - fprintf(stderr, + fprintf(stderr, "InnoDB: The error means mysqld does not have the access rights to\n" "InnoDB: the directory.\n"); } else { - if (strerror((int)err) != NULL) { + if (strerror((int)err) != NULL) { fprintf(stderr, "InnoDB: Error number %lu means '%s'.\n", err, strerror((int)err)); - } + } - fprintf(stderr, + fprintf(stderr, "InnoDB: See also section 13.2 at http://www.innodb.com/ibman.php\n" "InnoDB: about operating system error numbers.\n"); } @@ -306,7 +316,7 @@ os_file_handle_error( { ulint err; - err = os_file_get_last_error(); + err = os_file_get_last_error(FALSE); if (err == OS_FILE_DISK_FULL) { /* We only print a warning about disk full once */ @@ -333,6 +343,7 @@ 
os_file_handle_error( return(FALSE); } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) { + return(TRUE); } else if (err == OS_FILE_ALREADY_EXISTS) { @@ -355,6 +366,102 @@ os_file_handle_error( return(FALSE); } +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__FreeBSD__) || defined(__NETWARE__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + * On FreeBSD with LinuxThreads, advisory locking does not work properly. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/******************************************************************** +Obtain an exclusive lock on a file. */ +static +int +os_file_lock( +/*=========*/ + /* out: 0 on success */ + int fd, /* in: file descriptor */ + const char* name) /* in: file name */ +{ + struct flock lk; + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + if (fcntl(fd, F_SETLK, &lk) == -1) { + fprintf(stderr, + "InnoDB: Unable to lock %s, error: %d", name, errno); + close(fd); + return(-1); + } + return(0); +} +#endif /* USE_FILE_LOCK */ + +/******************************************************************** +Does error handling when a file operation fails. */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + /* out: TRUE if we should retry the + operation */ + os_file_t file, /* in: file pointer */ + const char* name, /* in: name of a file or NULL */ + const char* operation)/* in: operation */ +{ + ulint err; + + UT_NOT_USED(file); + + err = os_file_get_last_error(FALSE); + + if (err == OS_FILE_DISK_FULL) { + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(FALSE); + } + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. 
Try to clean the disk to free space.\n"); + + os_has_said_disk_full = TRUE; + + fflush(stderr); + + return(FALSE); + + } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) { + + return(TRUE); + + } else if (err == OS_FILE_ALREADY_EXISTS) { + + return(FALSE); + } else { + if (name) { + fprintf(stderr, "InnoDB: File name %s\n", name); + } + + fprintf(stderr, "InnoDB: File operation call: '%s'.\n", + operation); + return (FALSE); + } + + return(FALSE); /* not reached */ +} + /******************************************************************** Creates the seek mutexes used in positioned reads and writes. */ @@ -390,21 +497,285 @@ os_file_create_tmpfile(void) return(file); } +/*************************************************************************** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. */ + +os_file_dir_t +os_file_opendir( +/*============*/ + /* out: directory stream, NULL if + error */ + const char* dirname, /* in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /* in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. 
*/ + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); + + dir = FindFirstFile(path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(dir); +#endif +} + +/*************************************************************************** +Closes a directory stream. */ + +int +os_file_closedir( +/*=============*/ + /* out: 0 if success, -1 if failure */ + os_file_dir_t dir) /* in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, NULL, "closedir"); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(0, NULL, "closedir"); + } + + return(ret); +#endif +} + +/*************************************************************************** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
*/ + +int +os_file_readdir_next_file( +/*======================*/ + /* out: 0 if ok, -1 if error, 1 if at the end + of the directory */ + const char* dirname,/* in: directory name or path */ + os_file_dir_t dir, /* in: directory stream */ + os_file_stat_t* info) /* in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen(lpFindFileData->cFileName) < OS_FILE_MAX_PATH); + + if (strcmp(lpFindFileData->cFileName, ".") == 0 + || strcmp(lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, lpFindFileData->cFileName); + + info->size = (ib_longlong)(lpFindFileData->nFileSizeLow) + + (((ib_longlong)(lpFindFileData->nFileSizeHigh)) << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { +/* TODO: test Windows symlinks */ +/* TODO: MySQL has apparently its own symlink implementation in Windows, +dbname.sym can redirect a database directory: +http://www.mysql.com/doc/en/Windows_symbolic_links.html */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_NORMAL) { +/* TODO: are FILE_ATTRIBUTE_NORMAL files really all normal files? 
*/ + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(NULL, dirname, + "readdir_next_file"); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +next_file: + ent = readdir(dir); + + if (ent == NULL) { + return(1); + } + + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + os_file_handle_error_no_exit(0, full_path, "stat"); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_longlong)statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/********************************************************************* +This function attempts to create a directory named pathname. The new directory +gets default permissions. On Unix the permissions are (0770 & ~umask). If the +directory exists already, nothing is done and the call succeeds, unless the +fail_if_exists arguments is true. */ + +ibool +os_file_create_directory( +/*=====================*/ + /* out: TRUE if call succeeds, + FALSE on error */ + const char* pathname, /* in: directory name as + null-terminated string */ + ibool fail_if_exists) /* in: if TRUE, pre-existing directory + is treated as an error. 
*/ +{ +#ifdef __WIN__ + BOOL rcode; + + rcode = CreateDirectory(pathname, NULL); + if (!(rcode != 0 || + (GetLastError() == ERROR_FILE_EXISTS && !fail_if_exists))) { + /* failure */ + os_file_handle_error(pathname, "CreateDirectory"); + + return(FALSE); + } + + return (TRUE); +#else + int rcode; + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error(pathname, "mkdir"); + + return(FALSE); + } + + return (TRUE); +#endif +} + /******************************************************************** A simple function to open or create a file. */ os_file_t os_file_create_simple( /*==================*/ - /* out, own: handle to the file, not defined if error, - error number can be retrieved with os_get_last_error */ - char* name, /* in: name of the file or path as a null-terminated - string */ - ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened - (if does not exist, error), or OS_FILE_CREATE if a new - file is created (if exists, error) */ - ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success)/* out: TRUE if succeed, FALSE if error */ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is + opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), or + OS_FILE_CREATE_PATH if new file + (if exists, error) and subdirectories along + its path are created (if needed)*/ + ulint access_type,/* in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ { #ifdef __WIN__ os_file_t file; @@ -420,6 +791,14 @@ try_again: create_flag = OPEN_EXISTING; } else if (create_mode == OS_FILE_CREATE) { create_flag = CREATE_NEW; + } else if (create_mode == 
OS_FILE_CREATE_PATH) { + /* create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + if (!*success) { + ut_error; + } + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; } else { create_flag = 0; ut_error; @@ -473,6 +852,14 @@ try_again: } } else if (create_mode == OS_FILE_CREATE) { create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_CREATE_PATH) { + /* create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + if (!*success) { + return (-1); + } + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; } else { create_flag = 0; ut_error; @@ -494,6 +881,12 @@ try_again: if (retry) { goto try_again; } +#ifdef USE_FILE_LOCK + } else if (access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; + file = -1; +#endif } else { *success = TRUE; } @@ -508,21 +901,27 @@ A simple function to open or create a file. */ os_file_t os_file_create_simple_no_error_handling( /*====================================*/ - /* out, own: handle to the file, not defined if error, - error number can be retrieved with os_get_last_error */ - char* name, /* in: name of the file or path as a null-terminated - string */ - ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened - (if does not exist, error), or OS_FILE_CREATE if a new - file is created (if exists, error) */ - ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success)/* out: TRUE if succeed, FALSE if error */ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY, + 
OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ { #ifdef __WIN__ os_file_t file; DWORD create_flag; DWORD access; DWORD attributes = 0; + DWORD share_mode = FILE_SHARE_READ; ut_a(name); @@ -539,6 +938,13 @@ os_file_create_simple_no_error_handling( access = GENERIC_READ; } else if (access_type == OS_FILE_READ_WRITE) { access = GENERIC_READ | GENERIC_WRITE; + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + access = GENERIC_READ; + share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ + | FILE_SHARE_WRITE; /* A backup program has to give + mysqld the maximum freedom to + do what it likes with the + file */ } else { access = 0; ut_error; @@ -546,8 +952,7 @@ os_file_create_simple_no_error_handling( file = CreateFile(name, access, - FILE_SHARE_READ,/* file can be read also by other - processes */ + share_mode, NULL, /* default security attributes */ create_flag, attributes, @@ -588,6 +993,12 @@ os_file_create_simple_no_error_handling( if (file == -1) { *success = FALSE; +#ifdef USE_FILE_LOCK + } else if (access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; + file = -1; +#endif } else { *success = TRUE; } @@ -602,33 +1013,42 @@ Opens an existing file or creates a new. */ os_file_t os_file_create( /*===========*/ - /* out, own: handle to the file, not defined if error, - error number can be retrieved with os_get_last_error */ - char* name, /* in: name of the file or path as a null-terminated - string */ - ulint create_mode, /* in: OS_FILE_OPEN if an existing file is opened - (if does not exist, error), or OS_FILE_CREATE if a new - file is created (if exists, error), OS_FILE_OVERWRITE - if a new is created or an old overwritten */ - ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o - is desired, OS_FILE_NORMAL, if any normal file; - NOTE that it also depends on type, os_aio_.. and srv_.. 
- variables whether we really use async i/o or - unbuffered i/o: look in the function source code for - the exact rules */ - ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/* out: TRUE if succeed, FALSE if error */ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), + OS_FILE_OVERWRITE if a new file is created + or an old overwritten; + OS_FILE_OPEN_RAW, if a raw device or disk + partition should be opened */ + ulint purpose,/* in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ { #ifdef __WIN__ os_file_t file; + DWORD share_mode = FILE_SHARE_READ; DWORD create_flag; DWORD attributes; ibool retry; - try_again: ut_a(name); - if (create_mode == OS_FILE_OPEN) { + if (create_mode == OS_FILE_OPEN_RAW) { + create_flag = OPEN_EXISTING; + share_mode = FILE_SHARE_WRITE; + } else if (create_mode == OS_FILE_OPEN) { create_flag = OPEN_EXISTING; } else if (create_mode == OS_FILE_CREATE) { create_flag = CREATE_NEW; @@ -678,14 +1098,17 @@ try_again: file = CreateFile(name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ - FILE_SHARE_READ,/* File can be read also by other + share_mode, /* File can be read also by other processes; we must give the read permission because of ibbackup. 
We do not give the write permission to others because if one would succeed to start 2 instances of mysqld on the SAME files, that could cause severe - database corruption! */ + database corruption! When opening + raw disk partitions, Microsoft manuals + say that we must give also the write + permission. */ NULL, /* default security attributes */ create_flag, attributes, @@ -695,8 +1118,8 @@ try_again: *success = FALSE; retry = os_file_handle_error(name, - create_mode == OS_FILE_OPEN ? - "open" : "create"); + create_mode == OS_FILE_CREATE ? + "create" : "open"); if (retry) { goto try_again; } @@ -716,17 +1139,14 @@ try_again: try_again: ut_a(name); - if (create_mode == OS_FILE_OPEN) { + if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW) { mode_str = "OPEN"; - create_flag = O_RDWR; } else if (create_mode == OS_FILE_CREATE) { mode_str = "CREATE"; - create_flag = O_RDWR | O_CREAT | O_EXCL; } else if (create_mode == OS_FILE_OVERWRITE) { mode_str = "OVERWRITE"; - create_flag = O_RDWR | O_CREAT | O_TRUNC; } else { create_flag = 0; @@ -783,11 +1203,17 @@ try_again: *success = FALSE; retry = os_file_handle_error(name, - create_mode == OS_FILE_OPEN ? - "open" : "create"); + create_mode == OS_FILE_CREATE ? + "create" : "open"); if (retry) { goto try_again; } +#ifdef USE_FILE_LOCK + } else if (create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file, name)) { + *success = FALSE; + file = -1; +#endif } else { *success = TRUE; } @@ -797,6 +1223,168 @@ try_again: } /*************************************************************************** +Deletes a file if it exists. The file has to be closed before calling this. 
*/ + +ibool +os_file_delete_if_exists( +/*=====================*/ + /* out: TRUE if success */ + const char* name) /* in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + if (GetLastError() == ERROR_PATH_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(TRUE); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + fprintf(stderr, +"InnoDB: Warning: cannot delete file %s\n" +"InnoDB: Are you running ibbackup to back up the file?\n", name); + + os_file_get_last_error(TRUE); /* print error information */ + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(FALSE); + } + + goto loop; +#else + int ret; + + ret = unlink((const char*)name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error(name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Deletes a file. The file has to be closed before calling this. 
*/ + +ibool +os_file_delete( +/*===========*/ + /* out: TRUE if success */ + const char* name) /* in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + if (GetLastError() == ERROR_PATH_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(FALSE); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + fprintf(stderr, +"InnoDB: Warning: cannot delete file %s\n" +"InnoDB: Are you running ibbackup to back up the file?\n", name); + + os_file_get_last_error(TRUE); /* print error information */ + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(FALSE); + } + + goto loop; +#else + int ret; + + ret = unlink((const char*)name); + + if (ret != 0) { + os_file_handle_error(name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. */ + +ibool +os_file_rename( +/*===========*/ + /* out: TRUE if success */ + const char* oldpath,/* in: old file path as a null-terminated + string */ + const char* newpath)/* in: new file path */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(oldpath, "rename"); + + return(FALSE); +#else + int ret; + + ret = rename((const char*)oldpath, (const char*)newpath); + + if (ret != 0) { + os_file_handle_error(oldpath, "rename"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** Closes a file handle. 
In case of error, error number can be retrieved with os_file_get_last_error. */ @@ -818,6 +1406,7 @@ os_file_close( } os_file_handle_error(NULL, "close"); + return(FALSE); #else int ret; @@ -826,6 +1415,7 @@ os_file_close( if (ret == -1) { os_file_handle_error(NULL, "close"); + return(FALSE); } @@ -905,7 +1495,7 @@ os_file_get_size( } if (sizeof(off_t) > 4) { - *size = (ulint)(offs & 0xFFFFFFFF); + *size = (ulint)(offs & 0xFFFFFFFFUL); *size_high = (ulint)(offs >> 32); } else { *size = (ulint) offs; @@ -917,13 +1507,36 @@ os_file_get_size( } /*************************************************************************** +Gets file size as a 64-bit integer ib_longlong. */ + +ib_longlong +os_file_get_size_as_iblonglong( +/*===========================*/ + /* out: size in bytes, -1 if error */ + os_file_t file) /* in: handle to a file */ +{ + ulint size; + ulint size_high; + ibool success; + + success = os_file_get_size(file, &size, &size_high); + + if (!success) { + + return(-1); + } + + return((((ib_longlong)size_high) << 32) + (ib_longlong)size); +} + +/*************************************************************************** Sets a file size. This function can be used to extend or truncate a file. 
*/ ibool os_file_set_size( /*=============*/ /* out: TRUE if success */ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ os_file_t file, /* in: handle to a file */ ulint size, /* in: least significant 32 bits of file @@ -982,7 +1595,7 @@ os_file_set_size( != offset / (ib_longlong)(100 * 1024 * 1024)) { fprintf(stderr, " %lu00", - (ulint)((offset + n_bytes) + (ulong) ((offset + n_bytes) / (ib_longlong)(100 * 1024 * 1024))); } @@ -1045,6 +1658,15 @@ os_file_flush( return(TRUE); } + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + os_file_handle_error(NULL, "flush"); /* It is a fatal error if a file flush does not succeed, because then @@ -1068,9 +1690,10 @@ os_file_flush( } /* Since Linux returns EINVAL if the 'file' is actually a raw device, - we choose to ignore that error */ + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { - if (errno == EINVAL) { return(TRUE); } @@ -1108,7 +1731,7 @@ os_file_pread( off_t offs; ssize_t n_bytes; - ut_a((offset & 0xFFFFFFFF) == offset); + ut_a((offset & 0xFFFFFFFFUL) == offset); /* If off_t is > 4 bytes in size, then we assume we can pass a 64-bit address */ @@ -1174,7 +1797,7 @@ os_file_pwrite( /*===========*/ /* out: number of bytes written, -1 if error */ os_file_t file, /* in: handle to a file */ - void* buf, /* in: buffer from where to write */ + const void* buf, /* in: buffer from where to write */ ulint n, /* in: number of bytes to write */ ulint offset, /* in: least significant 32 bits of file offset where to write */ @@ -1184,7 +1807,7 @@ os_file_pwrite( ssize_t ret; off_t offs; - ut_a((offset & 0xFFFFFFFF) == offset); + ut_a((offset & 0xFFFFFFFFUL) == offset); /* If 
off_t is > 4 bytes in size, then we assume we can pass a 64-bit address */ @@ -1288,7 +1911,7 @@ os_file_read( ibool retry; ulint i; - ut_a((offset & 0xFFFFFFFF) == offset); + ut_a((offset & 0xFFFFFFFFUL) == offset); os_n_file_reads++; os_bytes_read_since_printout += n; @@ -1348,9 +1971,9 @@ error_handling: fprintf(stderr, "InnoDB: Fatal error: cannot read from file. OS error number %lu.\n", #ifdef __WIN__ - (ulint)GetLastError() + (ulong) GetLastError() #else - (ulint)errno + (ulong) errno #endif ); fflush(stderr); @@ -1361,6 +1984,92 @@ error_handling: } /*********************************************************************** +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. */ + +ibool +os_file_read_no_error_handling( +/*===========================*/ + /* out: TRUE if request was + successful, FALSE if fail */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint offset, /* in: least significant 32 bits of file + offset where to read */ + ulint offset_high, /* in: most significant 32 bits of + offset */ + ulint n) /* in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; + ulint i; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = offset; + high = offset_high; + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + goto error_handling; + } + + ret = ReadFile(file, buf, n, &len, NULL); + + os_mutex_exit(os_file_seek_mutexes[i]); + + if (ret && len == n) { + return(TRUE); + } +#else + ibool 
retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } +#endif +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error_no_exit(file, NULL, "read"); + + if (retry) { + goto try_again; + } + + return(FALSE); +} + +/*********************************************************************** Requests a synchronous write operation. */ ibool @@ -1368,10 +2077,10 @@ os_file_write( /*==========*/ /* out: TRUE if request was successful, FALSE if fail */ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ os_file_t file, /* in: handle to a file */ - void* buf, /* in: buffer from which to write */ + const void* buf, /* in: buffer from which to write */ ulint offset, /* in: least significant 32 bits of file offset where to write */ ulint offset_high, /* in: most significant 32 bits of @@ -1417,8 +2126,8 @@ retry: "InnoDB: offset %lu %lu. 
Operating system error number %lu.\n" "InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.php\n" "InnoDB: what the error number means.\n", - name, offset_high, offset, - (ulint)GetLastError()); + name, (ulong) offset_high, (ulong) offset, + (ulong) GetLastError()); return(FALSE); } @@ -1464,12 +2173,12 @@ retry: "InnoDB: Operating system error number %lu.\n" "InnoDB: Check that your OS and file system support files of this size.\n" "InnoDB: Check also that the disk is not full or a disk quota exceeded.\n", - name, offset_high, offset, n, (ulint)len, - err); + name, (ulong) offset_high, (ulong) offset, + (ulong) n, (ulong) len, (ulong) err); if (strerror((int)err) != NULL) { fprintf(stderr, -"InnoDB: Error number %lu means '%s'.\n", err, strerror((int)err)); +"InnoDB: Error number %lu means '%s'.\n", (ulong) err, strerror((int)err)); } fprintf(stderr, @@ -1518,6 +2227,182 @@ retry: #endif } +/*********************************************************************** +Check the existence and type of the given file. 
*/ + +ibool +os_file_status( +/*===========*/ + /* out: TRUE if call succeeded */ + const char* path, /* in: pathname of the file */ + ibool* exists, /* out: TRUE if file exists */ + os_file_type_t* type) /* out: type of the file (if it exists) */ +{ +#ifdef __WIN__ + int ret; + struct _stat statinfo; + + ret = _stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(0, path, "stat"); + + return(FALSE); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(0, path, "stat"); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#endif +} + +/* path name separator character */ +#ifdef __WIN__ +# define OS_FILE_PATH_SEPARATOR '\\' +#else +# define OS_FILE_PATH_SEPARATOR '/' +#endif + +/******************************************************************** +The function os_file_dirname returns a directory component of a +null-terminated pathname string. In the usual case, dirname returns +the string up to, but not including, the final '/', and basename +is the component following the final '/'. Trailing '/' charac +ters are not counted as part of the pathname. 
+ +If path does not contain a slash, dirname returns the string ".". + +Concatenating the string returned by dirname, a "/", and the basename +yields a complete pathname. + +The return value is a copy of the directory component of the pathname. +The copy is allocated from heap. It is the caller's responsibility +to free it after it is no longer needed. + +The following list of examples (taken from SUSv2) shows the strings +returned by dirname and basename for different paths: + + path dirname basename + "/usr/lib" "/usr" "lib" + "/usr/" "/" "usr" + "usr" "." "usr" + "/" "/" "/" + "." "." "." + ".." "." ".." +*/ + +char* +os_file_dirname( +/*============*/ + /* out, own: directory component of the + pathname */ + const char* path) /* in: pathname */ +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR); + if (!last_slash) { + /* No slash in the path, return "." */ + + return(mem_strdup(".")); + } + + /* Ok, there is a slash */ + + if (last_slash == path) { + /* last slash is the first char of the path */ + + return(mem_strdup("/")); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, last_slash - path)); +} + +/******************************************************************** +Creates all missing subdirectories along the given path. 
*/ + +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + /* out: TRUE if call succeeded + FALSE otherwise */ + const char* path) /* in: path name */ +{ + char* subdir; + ibool success, subdir_exists; + os_file_type_t type; + + subdir = os_file_dirname(path); + if (strlen(subdir) == 1 + && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { + /* subdir is root or cwd, nothing to do */ + mem_free(subdir); + + return(TRUE); + } + + /* Test if subdir exists */ + success = os_file_status(subdir, &subdir_exists, &type); + if (success && !subdir_exists) { + /* subdir does not exist, create it */ + success = os_file_create_subdirs_if_needed(subdir); + if (!success) { + mem_free(subdir); + + return(FALSE); + } + success = os_file_create_directory(subdir, FALSE); + } + + mem_free(subdir); + + return(success); +} + /******************************************************************** Returns a pointer to the nth slot in the aio array. */ static @@ -1885,7 +2770,7 @@ os_aio_array_reserve_slot( void* message2,/* in: message to be passed along with the aio operation */ os_file_t file, /* in: file handle */ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ void* buf, /* in: buffer where to read or from which to write */ @@ -2132,7 +3017,7 @@ os_aio( because i/os are not actually handled until all have been posted: use with great caution! 
*/ - char* name, /* in: name of the file or path as a + const char* name, /* in: name of the file or path as a null-terminated string */ os_file_t file, /* in: handle to a file */ void* buf, /* in: buffer where to read or from which @@ -2519,7 +3404,7 @@ os_aio_simulated_handle( ulint biggest_age; ulint age; byte* combined_buf; - byte* combined_buf2= 0; /* Remove warning */ + byte* combined_buf2; ibool ret; ulint n; ulint i; @@ -2559,7 +3444,7 @@ restart: if (os_aio_print_debug) { fprintf(stderr, -"InnoDB: i/o for slot %lu already done, returning\n", i); +"InnoDB: i/o for slot %lu already done, returning\n", (ulong) i); } ret = TRUE; @@ -2675,6 +3560,7 @@ consecutive_loop: if (n_consecutive == 1) { /* We can use the buffer of the i/o request */ combined_buf = slot->buf; + combined_buf2 = NULL; } else { combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); @@ -2706,8 +3592,8 @@ consecutive_loop: if (os_aio_print_debug) { fprintf(stderr, "InnoDB: doing i/o of type %lu at offset %lu %lu, length %lu\n", - slot->type, slot->offset_high, slot->offset, - total_len); + (ulong) slot->type, (ulong) slot->offset_high, + (ulong) slot->offset, (ulong) total_len); } /* Do the i/o with ordinary, synchronous i/o functions: */ @@ -2717,8 +3603,9 @@ consecutive_loop: || (slot->offset % UNIV_PAGE_SIZE != 0)) { fprintf(stderr, "InnoDB: Error: trying a displaced write to %s %lu %lu, len %lu\n", - slot->name, slot->offset_high, - slot->offset, total_len); + slot->name, (ulong) slot->offset_high, + (ulong) slot->offset, + (ulong) total_len); ut_error; } @@ -2769,7 +3656,7 @@ consecutive_loop: } } - if (n_consecutive > 1) { + if (combined_buf2) { ut_free(combined_buf2); } @@ -2816,7 +3703,7 @@ recommended_sleep: if (os_aio_print_debug) { fprintf(stderr, "InnoDB: i/o handler thread for i/o segment %lu wakes up\n", - global_segment); + (ulong) global_segment); } goto restart; @@ -2892,7 +3779,7 @@ os_aio_print( ulint i; for (i = 0; i < srv_n_file_io_threads; i++) { - fprintf(file, "I/O 
thread %lu state: %s (%s)\n", i, + fprintf(file, "I/O thread %lu state: %s (%s)\n", (ulong) i, srv_io_thread_op_info[i], srv_io_thread_function[i]); } @@ -2923,7 +3810,7 @@ loop: ut_a(array->n_reserved == n_reserved); - fprintf(file, " %lu", n_reserved); + fprintf(file, " %lu", (ulong) n_reserved); os_mutex_exit(array->mutex); @@ -2963,13 +3850,16 @@ loop: fprintf(file, "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", - fil_n_pending_log_flushes, fil_n_pending_tablespace_flushes, - os_n_file_reads, os_n_file_writes, os_n_fsyncs); + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { fprintf(file, "%lu pending preads, %lu pending pwrites\n", - os_file_n_pending_preads, os_file_n_pending_pwrites); + (ulong) os_file_n_pending_preads, + (ulong) os_file_n_pending_pwrites); } if (os_n_file_reads == os_n_file_reads_old) { @@ -2983,7 +3873,7 @@ loop: "%.2f reads/s, %lu avg bytes/read, %.2f writes/s, %.2f fsyncs/s\n", (os_n_file_reads - os_n_file_reads_old) / time_elapsed, - (ulint)avg_bytes_read, + (ulong)avg_bytes_read, (os_n_file_writes - os_n_file_writes_old) / time_elapsed, (os_n_fsyncs - os_n_fsyncs_old) diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 87a0bfb9e92..2f155788420 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -12,11 +12,469 @@ Created 9/30/1995 Heikki Tuuri #include "os0proc.ic" #endif +#include "ut0mem.h" +#include "ut0byte.h" + + +/* +How to get AWE to compile on Windows? +------------------------------------- + +In the project settings of the innobase project the Visual C++ source, +__WIN2000__ has to be defined. + +The Visual C++ has to be relatively recent and _WIN32_WINNT has to be +defined to a value >= 0x0500 when windows.h is included. 
+ +#define _WIN32_WINNT 0x0500 + +Where does AWE work? +------------------- + +See the error message in os_awe_allocate_physical_mem(). + +How to assign privileges for mysqld to use AWE? +----------------------------------------------- + +See the error message in os_awe_enable_lock_pages_in_mem(). + +Use Windows AWE functions in this order +--------------------------------------- + +(1) os_awe_enable_lock_pages_in_mem(); +(2) os_awe_allocate_physical_mem(); +(3) os_awe_allocate_virtual_mem_window(); +(4) os_awe_map_physical_mem_to_window(). + +To test 'AWE' in a computer which does not have the AWE API, +you can compile with UNIV_SIMULATE_AWE defined in this file. +*/ + +#ifdef UNIV_SIMULATE_AWE +/* If we simulate AWE, we allocate the 'physical memory' here */ +byte* os_awe_simulate_mem; +ulint os_awe_simulate_mem_size; +os_awe_t* os_awe_simulate_page_info; +byte* os_awe_simulate_window; +ulint os_awe_simulate_window_size; +/* In simulated AWE the following contains a NULL pointer or a pointer +to a mapped 'physical page' for each 4 kB page in the AWE window */ +byte** os_awe_simulate_map; +#endif + +#ifdef __WIN2000__ +os_awe_t* os_awe_page_info; +ulint os_awe_n_pages; +byte* os_awe_window; +ulint os_awe_window_size; +#endif + +/******************************************************************** +Windows AWE support. Tries to enable the "lock pages in memory" privilege for +the current process so that the current process can allocate memory-locked +virtual address space to act as the window where AWE maps physical memory. 
*/ + +ibool +os_awe_enable_lock_pages_in_mem(void) +/*=================================*/ + /* out: TRUE if success, FALSE if error; + prints error info to stderr if no success */ +{ +#ifdef UNIV_SIMULATE_AWE + + return(TRUE); + +#elif defined(__WIN2000__) + struct { + DWORD Count; + LUID_AND_ATTRIBUTES Privilege[1]; + } Info; + HANDLE hProcess; + HANDLE Token; + BOOL Result; + + hProcess = GetCurrentProcess(); + + /* Open the token of the current process */ + + Result = OpenProcessToken(hProcess, + TOKEN_ADJUST_PRIVILEGES, + &Token); + if (Result != TRUE) { + fprintf(stderr, + "InnoDB: AWE: Cannot open process token, error %lu\n", + (ulint)GetLastError()); + return(FALSE); + } + + Info.Count = 1; + + Info.Privilege[0].Attributes = SE_PRIVILEGE_ENABLED; + + /* Get the local unique identifier (LUID) of the SE_LOCK_MEMORY + privilege */ + + Result = LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, + &(Info.Privilege[0].Luid)); + if (Result != TRUE) { + fprintf(stderr, + "InnoDB: AWE: Cannot get local privilege value for %s, error %lu.\n", + SE_LOCK_MEMORY_NAME, (ulint)GetLastError()); + + return(FALSE); + } + + /* Try to adjust the privilege */ + + Result = AdjustTokenPrivileges(Token, FALSE, + (PTOKEN_PRIVILEGES)&Info, + 0, NULL, NULL); + /* Check the result */ + + if (Result != TRUE) { + fprintf(stderr, + "InnoDB: AWE: Cannot adjust process token privileges, error %u.\n", + GetLastError()); + return(FALSE); + } else if (GetLastError() != ERROR_SUCCESS) { + fprintf(stderr, +"InnoDB: AWE: Cannot enable SE_LOCK_MEMORY privilege, error %lu.\n" +"InnoDB: In Windows XP Home you cannot use AWE. 
In Windows 2000 and XP\n" +"InnoDB: Professional you must go to the Control Panel, to\n" +"InnoDB: Security Settings, to Local Policies, and enable\n" +"InnoDB: the 'lock pages in memory' privilege for the user who runs\n" +"InnoDB: the MySQL server.\n", GetLastError()); + + return(FALSE); + } + + CloseHandle(Token); + + return(TRUE); +#else #ifdef __WIN__ -#include <windows.h> + fprintf(stderr, +"InnoDB: AWE: Error: to use AWE you must use a ...-nt MySQL executable.\n"); +#endif + return(FALSE); #endif +} -#include "ut0mem.h" +/******************************************************************** +Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86 +processor. */ + +ibool +os_awe_allocate_physical_mem( +/*=========================*/ + /* out: TRUE if success */ + os_awe_t** page_info, /* out, own: array of opaque data containing + the info for allocated physical memory pages; + each allocated 4 kB physical memory page has + one slot of type os_awe_t in the array */ + ulint n_megabytes) /* in: number of megabytes to allocate */ +{ +#ifdef UNIV_SIMULATE_AWE + os_awe_simulate_page_info = ut_malloc(sizeof(os_awe_t) * + n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE)); + + os_awe_simulate_mem = ut_align(ut_malloc( + 4096 + 1024 * 1024 * n_megabytes), + 4096); + os_awe_simulate_mem_size = n_megabytes * 1024 * 1024; + + *page_info = os_awe_simulate_page_info; + + return(TRUE); + +#elif defined(__WIN2000__) + BOOL bResult; + os_awe_t NumberOfPages; /* Question: why does Windows + use the name ULONG_PTR for + a scalar integer type? Maybe + because we may also refer to + &NumberOfPages? 
*/ + os_awe_t NumberOfPagesInitial; + SYSTEM_INFO sSysInfo; + int PFNArraySize; + + if (n_megabytes > 64 * 1024) { + + fprintf(stderr, +"InnoDB: AWE: Error: tried to allocate %lu MB.\n" +"InnoDB: AWE cannot allocate more than 64 GB in any computer.\n", n_megabytes); + + return(FALSE); + } + + GetSystemInfo(&sSysInfo); /* fill the system information structure */ + + if ((ulint)OS_AWE_X86_PAGE_SIZE != (ulint)sSysInfo.dwPageSize) { + fprintf(stderr, +"InnoDB: AWE: Error: this computer has a page size of %lu.\n" +"InnoDB: Should be 4096 bytes for InnoDB AWE support to work.\n", + (ulint)sSysInfo.dwPageSize); + + return(FALSE); + } + + /* Calculate the number of pages of memory to request */ + + NumberOfPages = n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE); + + /* Calculate the size of page_info for allocated physical pages */ + + PFNArraySize = NumberOfPages * sizeof(os_awe_t); + + *page_info = (os_awe_t*)HeapAlloc(GetProcessHeap(), 0, PFNArraySize); + + if (*page_info == NULL) { + fprintf(stderr, +"InnoDB: AWE: Failed to allocate page info array from process heap, error %lu\n", + (ulint)GetLastError()); + + return(FALSE); + } + + ut_total_allocated_memory += PFNArraySize; + + /* Enable this process' privilege to lock pages to physical memory */ + + if (!os_awe_enable_lock_pages_in_mem()) { + + return(FALSE); + } + + /* Allocate the physical memory */ + + NumberOfPagesInitial = NumberOfPages; + + os_awe_page_info = *page_info; + os_awe_n_pages = (ulint)NumberOfPages; + + /* Compilation note: if the compiler complains the function is not + defined, see the note at the start of this file */ + + bResult = AllocateUserPhysicalPages(GetCurrentProcess(), + &NumberOfPages, + *page_info); + if (bResult != TRUE) { + fprintf(stderr, +"InnoDB: AWE: Cannot allocate physical pages, error %lu.\n", + (ulint)GetLastError()); + + return(FALSE); + } + + if (NumberOfPagesInitial != NumberOfPages) { + fprintf(stderr, +"InnoDB: AWE: Error: allocated only %lu pages of %lu 
requested.\n" +"InnoDB: Check that you have enough free RAM.\n" +"InnoDB: In Windows XP Professional and 2000 Professional\n" +"InnoDB: Windows PAE size is max 4 GB. In 2000 and .NET\n" +"InnoDB: Advanced Servers and 2000 Datacenter Server it is 32 GB,\n" +"InnoDB: and in .NET Datacenter Server it is 64 GB.\n" +"InnoDB: A Microsoft web page said that the processor must be an Intel\n" +"InnoDB: processor.\n", + (ulint)NumberOfPages, + (ulint)NumberOfPagesInitial); + + return(FALSE); + } + + fprintf(stderr, +"InnoDB: Using Address Windowing Extensions (AWE); allocated %lu MB\n", + n_megabytes); + + return(TRUE); +#else + return(FALSE); +#endif +} + +/******************************************************************** +Allocates a window in the virtual address space where we can map then +pages of physical memory. */ + +byte* +os_awe_allocate_virtual_mem_window( +/*===============================*/ + /* out, own: allocated memory, or NULL if did not + succeed */ + ulint size) /* in: virtual memory allocation size in bytes, must + be < 2 GB */ +{ +#ifdef UNIV_SIMULATE_AWE + ulint i; + + os_awe_simulate_window = ut_align(ut_malloc(4096 + size), 4096); + os_awe_simulate_window_size = size; + + os_awe_simulate_map = ut_malloc(sizeof(byte*) * (size / 4096)); + + for (i = 0; i < (size / 4096); i++) { + *(os_awe_simulate_map + i) = NULL; + } + + return(os_awe_simulate_window); + +#elif defined(__WIN2000__) + byte* ptr; + + if (size > (ulint)0x7FFFFFFFUL) { + fprintf(stderr, +"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory\n", size); + + return(NULL); + } + + ptr = VirtualAlloc(NULL, (SIZE_T)size, MEM_RESERVE | MEM_PHYSICAL, + PAGE_READWRITE); + if (ptr == NULL) { + fprintf(stderr, +"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory, error %lu\n", + size, (ulint)GetLastError()); + + return(NULL); + } + + os_awe_window = ptr; + os_awe_window_size = size; + + ut_total_allocated_memory += size; + + return(ptr); +#else + return(NULL); +#endif +} + 
+/******************************************************************** +With this function you can map parts of physical memory allocated with +the ..._allocate_physical_mem to the virtual address space allocated with +the previous function. Intel implements this so that the process page +tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP +showed that this takes < 1 microsecond, much better than the estimated 80 us +for copying a 16 kB page memory to memory. But, the operation will at least +partially invalidate the translation lookaside buffer (TLB) of all +processors. Under a real-world load the performance hit may be bigger. */ + +ibool +os_awe_map_physical_mem_to_window( +/*==============================*/ + /* out: TRUE if success; the function + calls exit(1) in case of an error */ + byte* ptr, /* in: a page-aligned pointer to + somewhere in the virtual address + space window; we map the physical mem + pages here */ + ulint n_mem_pages, /* in: number of 4 kB mem pages to + map */ + os_awe_t* page_info) /* in: array of page infos for those + pages; each page has one slot in the + array */ +{ +#ifdef UNIV_SIMULATE_AWE + ulint i; + byte** map; + byte* page; + byte* phys_page; + + ut_a(ptr >= os_awe_simulate_window); + ut_a(ptr < os_awe_simulate_window + os_awe_simulate_window_size); + ut_a(page_info >= os_awe_simulate_page_info); + ut_a(page_info < os_awe_simulate_page_info + + (os_awe_simulate_mem_size / 4096)); + + /* First look if some other 'physical pages' are mapped at ptr, + and copy them back to where they were if yes */ + + map = os_awe_simulate_map + + ((ulint)(ptr - os_awe_simulate_window)) / 4096; + page = ptr; + + for (i = 0; i < n_mem_pages; i++) { + if (*map != NULL) { + ut_memcpy(*map, page, 4096); + } + map++; + page += 4096; + } + + /* Then copy to ptr the 'physical pages' determined by page_info; we + assume page_info is a segment of the array we created at the start */ + + phys_page = os_awe_simulate_mem + + 
(ulint)(page_info - os_awe_simulate_page_info) + * 4096; + + ut_memcpy(ptr, phys_page, n_mem_pages * 4096); + + /* Update the map */ + + map = os_awe_simulate_map + + ((ulint)(ptr - os_awe_simulate_window)) / 4096; + + for (i = 0; i < n_mem_pages; i++) { + *map = phys_page; + + map++; + phys_page += 4096; + } + + return(TRUE); + +#elif defined(__WIN2000__) + BOOL bResult; + os_awe_t n_pages; + + n_pages = (os_awe_t)n_mem_pages; + + if (!(ptr >= os_awe_window)) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map to address %lx but AWE window start %lx\n", + (ulint)ptr, (ulint)os_awe_window); + ut_a(0); + } + + if (!(ptr <= os_awe_window + os_awe_window_size - UNIV_PAGE_SIZE)) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map to address %lx but AWE window end %lx\n", + (ulint)ptr, (ulint)os_awe_window + os_awe_window_size); + ut_a(0); + } + + if (!(page_info >= os_awe_page_info)) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map page info at %lx but array start %lx\n", + (ulint)page_info, (ulint)os_awe_page_info); + ut_a(0); + } + + if (!(page_info <= os_awe_page_info + (os_awe_n_pages - 4))) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map page info at %lx but array end %lx\n", + (ulint)page_info, (ulint)(os_awe_page_info + os_awe_n_pages)); + ut_a(0); + } + + bResult = MapUserPhysicalPages((PVOID)ptr, n_pages, page_info); + + if (bResult != TRUE) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: AWE: Mapping of %lu physical pages to address %lx failed,\n" +"InnoDB: error %lu.\n" +"InnoDB: Cannot continue operation.\n", + n_mem_pages, (ulint)ptr, (ulint)GetLastError()); + exit(1); + } + + return(TRUE); +#else + return(FALSE); +#endif +} /******************************************************************** Converts the current process id to a number. 
It is not guaranteed that the diff --git a/innobase/os/os0sync.c b/innobase/os/os0sync.c index 827d68501db..c48c44a4c70 100644 --- a/innobase/os/os0sync.c +++ b/innobase/os/os0sync.c @@ -109,9 +109,9 @@ must be reset explicitly by calling sync_os_reset_event. */ os_event_t os_event_create( /*============*/ - /* out: the event handle */ - char* name) /* in: the name of the event, if NULL - the event is created without a name */ + /* out: the event handle */ + const char* name) /* in: the name of the event, if NULL + the event is created without a name */ { #ifdef __WIN__ os_event_t event; @@ -125,7 +125,7 @@ os_event_create( if (!event->handle) { fprintf(stderr, "InnoDB: Could not create a Windows event semaphore; Windows error %lu\n", - (ulint)GetLastError()); + (ulong) GetLastError()); } #else /* Unix */ os_event_t event; @@ -166,9 +166,9 @@ reset when a single thread is released. Works only in Windows. */ os_event_t os_event_create_auto( /*=================*/ - /* out: the event handle */ - char* name) /* in: the name of the event, if NULL - the event is created without a name */ + /* out: the event handle */ + const char* name) /* in: the name of the event, if NULL + the event is created without a name */ { os_event_t event; @@ -182,7 +182,7 @@ os_event_create_auto( if (!event->handle) { fprintf(stderr, "InnoDB: Could not create a Windows auto event semaphore; Windows error %lu\n", - (ulint)GetLastError()); + (ulong) GetLastError()); } /* Put to the list of events */ @@ -412,7 +412,7 @@ os_event_wait_multiple( FALSE, /* Wait for any 1 event */ INFINITE); /* Infinite wait time limit */ - ut_a(index >= WAIT_OBJECT_0); + ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparison */ ut_a(index < WAIT_OBJECT_0 + n); if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { @@ -430,9 +430,9 @@ mutex semaphore of InnoDB itself (mutex_t) should be used where possible. 
*/ os_mutex_t os_mutex_create( /*============*/ - /* out: the mutex handle */ - char* name) /* in: the name of the mutex, if NULL - the mutex is created without a name */ + /* out: the mutex handle */ + const char* name) /* in: the name of the mutex, if NULL + the mutex is created without a name */ { #ifdef __WIN__ HANDLE mutex; diff --git a/innobase/os/os0thread.c b/innobase/os/os0thread.c index 59d0fdbd8c9..12a8abf3069 100644 --- a/innobase/os/os0thread.c +++ b/innobase/os/os0thread.c @@ -201,6 +201,15 @@ os_thread_exit( #endif } +#ifdef HAVE_PTHREAD_JOIN +int +os_thread_join( +/*=============*/ + os_thread_id_t thread_id) /* in: id of the thread to join */ +{ + return pthread_join(thread_id, NULL); +} +#endif /********************************************************************* Returns handle to the current thread. */ diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index fd613d5d9e7..459ab986610 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -709,8 +709,10 @@ page_cur_parse_insert_rec( "o_offset %lu\n" "mismatch index %lu, end_seg_len %lu\n" "parsed len %lu\n", - is_short, info_bits, offset, origin_offset, - mismatch_index, end_seg_len, (ulint)(ptr - ptr2)); + (ulong) is_short, (ulong) info_bits, (ulong) offset, + (ulong) origin_offset, + (ulong) mismatch_index, (ulong) end_seg_len, + (ulong) (ptr - ptr2)); fputs("Dump of 300 bytes of log:\n", stderr); ut_print_buf(stderr, ptr2, 300); diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index 76a0a950178..b5f411c43fc 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -94,13 +94,12 @@ page_dir_find_owner_slot( fprintf(stderr, "InnoDB: Probable data corruption on page %lu\n" "InnoDB: Original record ", - buf_frame_get_page_no(page)); + (ulong) buf_frame_get_page_no(page)); rec_print(stderr, original_rec); fprintf(stderr, "\n" - "InnoDB: on that page. Steps %lu.\n", steps); - + "InnoDB: on that page. 
Steps %lu.\n", (ulong) steps); fputs( "InnoDB: Cannot find the dir slot for record ", stderr); @@ -326,7 +325,7 @@ page_create( tuple = dtuple_create(heap, 1); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field,(char *) "infimum", strlen("infimum") + 1); + dfield_set_data(field, "infimum", sizeof "infimum"); dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); /* Set the corresponding physical record to its place in the page @@ -348,7 +347,7 @@ page_create( tuple = dtuple_create(heap, 1); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, (char *) "supremum", strlen("supremum") + 1); + dfield_set_data(field, "supremum", sizeof "supremum"); dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); supremum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); @@ -438,9 +437,9 @@ page_copy_rec_list_end_no_locks( fprintf(stderr, "InnoDB: rec offset %lu, cur1 offset %lu, cur2 offset %lu\n", - (ulint)(rec - page), - (ulint)(page_cur_get_rec(&cur1) - page), - (ulint)(page_cur_get_rec(&cur2) - new_page)); + (ulong)(rec - page), + (ulong)(page_cur_get_rec(&cur1) - page), + (ulong)(page_cur_get_rec(&cur2) - new_page)); ut_error; } @@ -554,7 +553,8 @@ byte* page_parse_delete_rec_list( /*=======================*/ /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or MLOG_LIST_START_DELETE */ + byte type, /* in: MLOG_LIST_END_DELETE or + MLOG_LIST_START_DELETE */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ page_t* page, /* in: page or NULL */ @@ -1123,9 +1123,9 @@ page_rec_print( rec_print(stderr, rec); fprintf(stderr, " n_owned: %lu; heap_no: %lu; next rec: %lu\n", - rec_get_n_owned(rec), - rec_get_heap_no(rec), - rec_get_next_offs(rec)); + (ulong) rec_get_n_owned(rec), + (ulong) rec_get_heap_no(rec), + (ulong) rec_get_next_offs(rec)); page_rec_check(rec); rec_validate(rec); @@ -1151,7 +1151,7 @@ page_dir_print( "PAGE DIRECTORY\n" "Page address %p\n" "Directory stack top at offs: 
%lu; number of slots: %lu\n", - page, (ulint)(page_dir_get_nth_slot(page, n - 1) - page), n); + page, (ulong)(page_dir_get_nth_slot(page, n - 1) - page), (ulong) n); for (i = 0; i < n; i++) { slot = page_dir_get_nth_slot(page, i); if ((i == pr_n) && (i < n - pr_n)) { @@ -1160,13 +1160,13 @@ page_dir_print( if ((i < pr_n) || (i >= n - pr_n)) { fprintf(stderr, "Contents of slot: %lu: n_owned: %lu, rec offs: %lu\n", - i, page_dir_slot_get_n_owned(slot), - (ulint)(page_dir_slot_get_rec(slot) - page)); + (ulong) i, (ulong) page_dir_slot_get_n_owned(slot), + (ulong)(page_dir_slot_get_rec(slot) - page)); } } fprintf(stderr, "Total of %lu records\n" "--------------------------------\n", - 2 + page_get_n_recs(page)); + (ulong) (2 + page_get_n_recs(page))); } /******************************************************************* @@ -1221,7 +1221,7 @@ page_print_list( fprintf(stderr, "Total of %lu records \n" "--------------------------------\n", - count + 1); + (ulong) (count + 1)); } /******************************************************************* @@ -1239,15 +1239,15 @@ page_header_print( "n dir slots %lu, heap top %lu\n" "Page n heap %lu, free %lu, garbage %lu\n" "Page last insert %lu, direction %lu, n direction %lu\n", - page, page_header_get_field(page, PAGE_N_RECS), - page_header_get_field(page, PAGE_N_DIR_SLOTS), - page_header_get_field(page, PAGE_HEAP_TOP), - page_header_get_field(page, PAGE_N_HEAP), - page_header_get_field(page, PAGE_FREE), - page_header_get_field(page, PAGE_GARBAGE), - page_header_get_field(page, PAGE_LAST_INSERT), - page_header_get_field(page, PAGE_DIRECTION), - page_header_get_field(page, PAGE_N_DIRECTION)); + page, (ulong) page_header_get_field(page, PAGE_N_RECS), + (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_header_get_field(page, PAGE_FREE), + (ulong) page_header_get_field(page, PAGE_GARBAGE), + (ulong) 
page_header_get_field(page, PAGE_LAST_INSERT), + (ulong) page_header_get_field(page, PAGE_DIRECTION), + (ulong) page_header_get_field(page, PAGE_N_DIRECTION)); } /******************************************************************* @@ -1292,15 +1292,15 @@ page_rec_validate( if (!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED)) { fprintf(stderr, "InnoDB: Dir slot of rec %lu, n owned too big %lu\n", - (ulint)(rec - page), n_owned); + (ulong)(rec - page), (ulong) n_owned); return(FALSE); } if (!(heap_no < page_header_get_field(page, PAGE_N_HEAP))) { fprintf(stderr, "InnoDB: Heap no of rec %lu too big %lu %lu\n", - (ulint)(rec - page), heap_no, - page_header_get_field(page, PAGE_N_HEAP)); + (ulong)(rec - page), (ulong) heap_no, + (ulong) page_header_get_field(page, PAGE_N_HEAP)); return(FALSE); } @@ -1366,7 +1366,7 @@ page_simple_validate( if (n_slots > UNIV_PAGE_SIZE / 4) { fprintf(stderr, - "InnoDB: Nonsensical number %lu of page dir slots\n", n_slots); + "InnoDB: Nonsensical number %lu of page dir slots\n", (ulong) n_slots); goto func_exit; } @@ -1377,8 +1377,8 @@ page_simple_validate( fprintf(stderr, "InnoDB: Record heap and dir overlap on a page, heap top %lu, dir %lu\n", - (ulint)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page), - (ulint)(page_dir_get_nth_slot(page, n_slots - 1) - page)); + (ulong)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page), + (ulong)(page_dir_get_nth_slot(page, n_slots - 1) - page)); goto func_exit; } @@ -1399,7 +1399,7 @@ page_simple_validate( if (rec > rec_heap_top) { fprintf(stderr, "InnoDB: Record %lu is above rec heap top %lu\n", - (ulint)(rec - page), (ulint)(rec_heap_top - page)); + (ulong)(rec - page), (ulong)(rec_heap_top - page)); goto func_exit; } @@ -1410,8 +1410,9 @@ page_simple_validate( fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu, rec %lu\n", - rec_get_n_owned(rec), own_count, - (ulint)(rec - page)); + (ulong) rec_get_n_owned(rec), + (ulong) own_count, + (ulong)(rec - page)); goto func_exit; } @@ -1419,7 +1420,7 @@ 
page_simple_validate( if (page_dir_slot_get_rec(slot) != rec) { fprintf(stderr, "InnoDB: Dir slot does not point to right rec %lu\n", - (ulint)(rec - page)); + (ulong)(rec - page)); goto func_exit; } @@ -1441,8 +1442,8 @@ page_simple_validate( || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset nonsensical %lu for rec %lu\n", - rec_get_next_offs(rec), - (ulint)(rec - page)); + (ulong) rec_get_next_offs(rec), + (ulong)(rec - page)); goto func_exit; } @@ -1452,7 +1453,7 @@ page_simple_validate( if (count > UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Page record list appears to be circular %lu\n", - count); + (ulong) count); goto func_exit; } @@ -1468,13 +1469,14 @@ page_simple_validate( if (slot_no != n_slots - 1) { fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", - slot_no, n_slots - 1); + (ulong) slot_no, (ulong) (n_slots - 1)); goto func_exit; } if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) { fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", - page_header_get_field(page, PAGE_N_RECS) + 2, count + 1); + (ulong) page_header_get_field(page, PAGE_N_RECS) + 2, + (ulong) (count + 1)); goto func_exit; } @@ -1487,7 +1489,7 @@ page_simple_validate( || rec >= page + UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Free list record has a nonsensical offset %lu\n", - (ulint)(rec - page)); + (ulong)(rec - page)); goto func_exit; } @@ -1495,7 +1497,7 @@ page_simple_validate( if (rec > rec_heap_top) { fprintf(stderr, "InnoDB: Free list record %lu is above rec heap top %lu\n", - (ulint)(rec - page), (ulint)(rec_heap_top - page)); + (ulong)(rec - page), (ulong)(rec_heap_top - page)); goto func_exit; } @@ -1505,7 +1507,7 @@ page_simple_validate( if (count > UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Page free list appears to be circular %lu\n", - count); + (ulong) count); goto func_exit; } @@ -1515,7 +1517,8 @@ page_simple_validate( if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { fprintf(stderr, "InnoDB: N heap is 
wrong %lu, %lu\n", - page_header_get_field(page, PAGE_N_HEAP), count + 1); + (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) (count + 1)); goto func_exit; } @@ -1606,7 +1609,7 @@ page_validate( if (!(1 == cmp_rec_rec(rec, old_rec, index))) { fprintf(stderr, "InnoDB: Records in wrong order on page %lu", - buf_frame_get_page_no(page)); + (ulong) buf_frame_get_page_no(page)); dict_index_name_print(stderr, index); fputs("\nInnoDB: previous record ", stderr); rec_print(stderr, old_rec); @@ -1643,7 +1646,8 @@ page_validate( if (rec_get_n_owned(rec) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu\n", - rec_get_n_owned(rec), own_count); + (ulong) rec_get_n_owned(rec), + (ulong) own_count); goto func_exit; } @@ -1671,7 +1675,7 @@ page_validate( || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset wrong %lu\n", - rec_get_next_offs(rec)); + (ulong) rec_get_next_offs(rec)); goto func_exit; } @@ -1688,20 +1692,21 @@ page_validate( if (slot_no != n_slots - 1) { fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n", - slot_no, n_slots - 1); + (ulong) slot_no, (ulong) (n_slots - 1)); goto func_exit; } if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) { fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", - page_header_get_field(page, PAGE_N_RECS) + 2, count + 1); + (ulong) page_header_get_field(page, PAGE_N_RECS) + 2, + (ulong) (count + 1)); goto func_exit; } if (data_size != page_get_data_size(page)) { fprintf(stderr, "InnoDB: Summed data size %lu, returned by func %lu\n", - data_size, page_get_data_size(page)); + (ulong) data_size, (ulong) page_get_data_size(page)); goto func_exit; } @@ -1733,7 +1738,8 @@ page_validate( if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", - page_header_get_field(page, PAGE_N_HEAP), count + 1); + (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) count + 1); goto func_exit; } @@ -1745,7 +1751,7 @@ 
func_exit: if (ret == FALSE) { func_exit2: fprintf(stderr, "InnoDB: Apparent corruption in page %lu in ", - buf_frame_get_page_no(page)); + (ulong) buf_frame_get_page_no(page)); dict_index_name_print(stderr, index); putc('\n', stderr); buf_page_print(page); diff --git a/innobase/pars/lexyy.c b/innobase/pars/lexyy.c index 60804eff2fe..0112f618533 100644 --- a/innobase/pars/lexyy.c +++ b/innobase/pars/lexyy.c @@ -1,7 +1,7 @@ /* A lexical scanner generated by flex */ /* Scanner skeleton version: - * $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $ + * $Header: /home/heikki/cvsroot/ib/pars/lexyy.c,v 1.2 2003/10/30 20:27:19 heikki Exp $ */ /* This include MUST be first to keep things portable ! */ diff --git a/innobase/pars/pars0opt.c b/innobase/pars/pars0opt.c index 5cc2e39b438..7ba4af15243 100644 --- a/innobase/pars/pars0opt.c +++ b/innobase/pars/pars0opt.c @@ -1231,7 +1231,8 @@ opt_print_query_plan( fputs("Table ", stderr); dict_index_name_print(stderr, plan->index); fprintf(stderr,"; exact m. %lu, match %lu, end conds %lu\n", - plan->n_exact_match, n_fields, - UT_LIST_GET_LEN(plan->end_conds)); + (unsigned long) plan->n_exact_match, + (unsigned long) n_fields, + (unsigned long) UT_LIST_GET_LEN(plan->end_conds)); } } diff --git a/innobase/pars/pars0pars.c b/innobase/pars/pars0pars.c index a4124672df0..e4b388cba82 100644 --- a/innobase/pars/pars0pars.c +++ b/innobase/pars/pars0pars.c @@ -530,7 +530,7 @@ pars_retrieve_table_def( /*====================*/ sym_node_t* sym_node) /* in: table node */ { - char* table_name; + const char* table_name; ut_a(sym_node); ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); @@ -538,7 +538,7 @@ pars_retrieve_table_def( sym_node->resolved = TRUE; sym_node->token_type = SYM_TABLE; - table_name = (char*) sym_node->name; + table_name = (const char*) sym_node->name; sym_node->table = dict_table_get_low(table_name); @@ -1713,7 +1713,8 @@ Called by yyparse on error. 
*/ void yyerror( /*====*/ - char* s __attribute__((unused))) /* in: error message string */ + const char* s __attribute__((unused))) + /* in: error message string */ { ut_ad(s); @@ -1728,8 +1729,8 @@ Parses an SQL string returning the query graph. */ que_t* pars_sql( /*=====*/ - /* out, own: the query graph */ - char* str) /* in: SQL string */ + /* out, own: the query graph */ + const char* str) /* in: SQL string */ { sym_node_t* sym_node; mem_heap_t* heap; diff --git a/innobase/pars/pars0sym.c b/innobase/pars/pars0sym.c index 1a0608ed142..194e6677183 100644 --- a/innobase/pars/pars0sym.c +++ b/innobase/pars/pars0sym.c @@ -217,13 +217,10 @@ sym_tab_add_id( node->common.type = QUE_NODE_SYMBOL; - node->name = mem_heap_alloc(sym_tab->heap, len + 1); node->resolved = FALSE; node->indirection = NULL; - ut_memcpy(node->name, name, len); - node->name[len] = '\0'; - + node->name = mem_heap_strdupl(sym_tab->heap, name, len + 1); node->name_len = len; UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); diff --git a/innobase/que/que0que.c b/innobase/que/que0que.c index 3af4cf09539..22878dec27f 100644 --- a/innobase/que/que0que.c +++ b/innobase/que/que0que.c @@ -485,7 +485,7 @@ que_graph_free_recursive( if (thr->magic_n != QUE_THR_MAGIC_N) { fprintf(stderr, "que_thr struct appears corrupt; magic n %lu\n", - thr->magic_n); + (unsigned long) thr->magic_n); mem_analyze_corruption((byte*)thr); ut_error; } @@ -597,7 +597,7 @@ que_graph_free_recursive( default: fprintf(stderr, "que_node struct appears corrupt; type %lu\n", - que_node_get_type(node)); + (unsigned long) que_node_get_type(node)); mem_analyze_corruption((byte*)node); ut_error; } @@ -986,7 +986,8 @@ que_thr_move_to_run_state_for_mysql( { if (thr->magic_n != QUE_THR_MAGIC_N) { fprintf(stderr, - "que_thr struct appears corrupt; magic n %lu\n", thr->magic_n); + "que_thr struct appears corrupt; magic n %lu\n", + (unsigned long) thr->magic_n); mem_analyze_corruption((byte*)thr); @@ -1022,7 +1023,8 @@ 
que_thr_stop_for_mysql_no_error( if (thr->magic_n != QUE_THR_MAGIC_N) { fprintf(stderr, - "que_thr struct appears corrupt; magic n %lu\n", thr->magic_n); + "que_thr struct appears corrupt; magic n %lu\n", + (unsigned long) thr->magic_n); mem_analyze_corruption((byte*)thr); @@ -1094,7 +1096,7 @@ que_node_print_info( str = "UNKNOWN NODE TYPE"; } - fprintf(stderr, "Node type %lu: %s, address %p\n", type, str, node); + fprintf(stderr, "Node type %lu: %s, address %p\n", (ulong) type, str, node); } /************************************************************************** @@ -1253,10 +1255,6 @@ loop: mutex_exit(&kernel_mutex); } */ - /* TRUE below denotes that the thread is allowed to own the dictionary - mutex, though */ - ut_ad(sync_thread_levels_empty_gen(TRUE)); - loop_count++; if (next_thr != thr) { diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c index 64b6d87283d..889612deef4 100644 --- a/innobase/read/read0read.c +++ b/innobase/read/read0read.c @@ -236,16 +236,16 @@ read_view_print( ulint i; fprintf(stderr, "Read view low limit trx n:o %lu %lu\n", - ut_dulint_get_high(view->low_limit_no), - ut_dulint_get_low(view->low_limit_no)); + (ulong) ut_dulint_get_high(view->low_limit_no), + (ulong) ut_dulint_get_low(view->low_limit_no)); fprintf(stderr, "Read view up limit trx id %lu %lu\n", - ut_dulint_get_high(view->up_limit_id), - ut_dulint_get_low(view->up_limit_id)); + (ulong) ut_dulint_get_high(view->up_limit_id), + (ulong) ut_dulint_get_low(view->up_limit_id)); fprintf(stderr, "Read view low limit trx id %lu %lu\n", - ut_dulint_get_high(view->low_limit_id), - ut_dulint_get_low(view->low_limit_id)); + (ulong) ut_dulint_get_high(view->low_limit_id), + (ulong) ut_dulint_get_low(view->low_limit_id)); fprintf(stderr, "Read view individually stored trx ids:\n"); @@ -253,7 +253,7 @@ read_view_print( for (i = 0; i < n_ids; i++) { fprintf(stderr, "Read view trx id %lu %lu\n", - ut_dulint_get_high(read_view_get_nth_trx_id(view, i)), - 
ut_dulint_get_low(read_view_get_nth_trx_id(view, i))); + (ulong) ut_dulint_get_high(read_view_get_nth_trx_id(view, i)), + (ulong) ut_dulint_get_low(read_view_get_nth_trx_id(view, i))); } } diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index 5dae9f7bc74..6e8f3d82ef3 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -63,10 +63,11 @@ must be a copy of the the one in ha_innobase.cc! */ extern int innobase_mysql_cmp( -/*===============*/ +/*===============*/ /* out: 1, 0, -1, if a is greater, equal, less than b, respectively */ - int mysql_type, /* in: MySQL type */ + int mysql_type, /* in: MySQL type */ + uint charset_number, /* in: number of the charset */ unsigned char* a, /* in: data field */ unsigned int a_length, /* in: data field length, not UNIV_SQL_NULL */ @@ -99,16 +100,28 @@ cmp_types_are_equal( dtype_t* type1, /* in: type 1 */ dtype_t* type2) /* in: type 2 */ { - if ((type1->mtype == DATA_VARCHAR && type2->mtype == DATA_CHAR) - || (type1->mtype == DATA_CHAR && type2->mtype == DATA_VARCHAR) - || (type1->mtype == DATA_FIXBINARY && type2->mtype == DATA_BINARY) - || (type1->mtype == DATA_BINARY && type2->mtype == DATA_FIXBINARY) - || (type1->mtype == DATA_MYSQL && type2->mtype == DATA_VARMYSQL) - || (type1->mtype == DATA_VARMYSQL && type2->mtype == DATA_MYSQL)) { - - return(TRUE); + if (dtype_is_non_binary_string_type(type1->mtype, type1->prtype) + && dtype_is_non_binary_string_type(type2->mtype, type2->prtype)) { + + /* Both are non-binary string types: they can be compared if + and only if the charset-collation is the same */ + + if (dtype_get_charset_coll(type1->prtype) + == dtype_get_charset_coll(type2->prtype)) { + return(TRUE); + } + + return(FALSE); } + if (dtype_is_binary_string_type(type1->mtype, type1->prtype) + && dtype_is_binary_string_type(type2->mtype, type2->prtype)) { + + /* Both are binary string types: they can be compared */ + + return(TRUE); + } + if (type1->mtype != type2->mtype) { return(FALSE); @@ -130,11 
+143,6 @@ cmp_types_are_equal( return(FALSE); } - if (type1->mtype == DATA_BLOB && (type1->prtype & DATA_BINARY_TYPE) - != (type2->prtype & DATA_BINARY_TYPE)) { - return(FALSE); - } - return(TRUE); } @@ -271,10 +279,12 @@ cmp_whole_field( return(innobase_mysql_cmp( (int)(type->prtype & DATA_MYSQL_TYPE_MASK), + (uint)dtype_get_charset_coll(type->prtype), a, a_length, b, b_length)); default: fprintf(stderr, - "InnoDB: unknown type number %lu\n", data_type); + "InnoDB: unknown type number %lu\n", + (ulong) data_type); ut_error; } @@ -323,7 +333,9 @@ cmp_data_data_slow( if (cur_type->mtype >= DATA_FLOAT || (cur_type->mtype == DATA_BLOB - && (cur_type->prtype & DATA_NONLATIN1))) { + && 0 == (cur_type->prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(cur_type->prtype) != + data_mysql_latin1_swedish_charset_coll)) { return(cmp_whole_field(cur_type, data1, len1, data2, len2)); } @@ -524,8 +536,10 @@ cmp_dtuple_rec_with_match( } if (cur_type->mtype >= DATA_FLOAT - || (cur_type->mtype == DATA_BLOB - && (cur_type->prtype & DATA_NONLATIN1))) { + || (cur_type->mtype == DATA_BLOB + && 0 == (cur_type->prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(cur_type->prtype) != + data_mysql_latin1_swedish_charset_coll)) { ret = cmp_whole_field(cur_type, dfield_get_data(dtuple_field), dtuple_f_len, @@ -846,8 +860,10 @@ cmp_rec_rec_with_match( } if (cur_type->mtype >= DATA_FLOAT - || (cur_type->mtype == DATA_BLOB - && (cur_type->prtype & DATA_NONLATIN1))) { + || (cur_type->mtype == DATA_BLOB + && 0 == (cur_type->prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(cur_type->prtype) != + data_mysql_latin1_swedish_charset_coll)) { ret = cmp_whole_field(cur_type, rec1_b_ptr, rec1_f_len, diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c index b9fa840d66d..1db89241dff 100644 --- a/innobase/rem/rem0rec.c +++ b/innobase/rem/rem0rec.c @@ -107,7 +107,7 @@ rec_get_nth_field( if (n > 1024) { fprintf(stderr, "Error: trying to access field %lu in rec\n", - n); + (ulong) n); 
ut_error; } @@ -474,7 +474,7 @@ rec_validate( if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { fprintf(stderr, "InnoDB: Error: record has %lu fields\n", - n_fields); + (ulong) n_fields); return(FALSE); } @@ -483,8 +483,8 @@ rec_validate( if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { fprintf(stderr, - "InnoDB: Error: record field %lu len %lu\n", i, - len); + "InnoDB: Error: record field %lu len %lu\n", (ulong) i, + (ulong) len); return(FALSE); } @@ -502,7 +502,8 @@ rec_validate( if (len_sum != (ulint)(rec_get_end(rec) - rec)) { fprintf(stderr, "InnoDB: Error: record len should be %lu, len %lu\n", - len_sum, (ulint)(rec_get_end(rec) - rec)); + (ulong) len_sum, + (ulong) (rec_get_end(rec) - rec)); return(FALSE); } @@ -531,14 +532,14 @@ rec_print( fprintf(file, "PHYSICAL RECORD: n_fields %lu;" " 1-byte offs %s; info bits %lu\n", - n, rec_get_1byte_offs_flag(rec) ? "TRUE" : "FALSE", - rec_get_info_bits(rec)); + (ulong) n, rec_get_1byte_offs_flag(rec) ? "TRUE" : "FALSE", + (ulong) rec_get_info_bits(rec)); for (i = 0; i < n; i++) { data = rec_get_nth_field(rec, i, &len); - fprintf(file, " %lu:", i); + fprintf(file, " %lu:", (ulong) i); if (len != UNIV_SQL_NULL) { if (len <= 30) { @@ -551,7 +552,7 @@ rec_print( } } else { fprintf(file, " SQL NULL, size %lu ", - rec_get_nth_field_size(rec, i)); + (ulong) rec_get_nth_field_size(rec, i)); } putc(';', file); diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index 458970da4e2..0da749212d2 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -543,7 +543,7 @@ static void row_ins_foreign_report_err( /*=======================*/ - char* errstr, /* in: error string from the viewpoint + const char* errstr, /* in: error string from the viewpoint of the parent table */ que_thr_t* thr, /* in: query thread whose run_node is an update node */ @@ -704,6 +704,8 @@ row_ins_foreign_check_on_constraint( ulint n_to_update; ulint err; ulint i; + + ut_a(thr && foreign && pcur && mtr); @@ -718,7 +720,7 
@@ row_ins_foreign_check_on_constraint( (DICT_FOREIGN_ON_DELETE_CASCADE | DICT_FOREIGN_ON_DELETE_SET_NULL))) { - row_ins_foreign_report_err((char*)"Trying to delete", + row_ins_foreign_report_err("Trying to delete", thr, foreign, btr_pcur_get_rec(pcur), entry); @@ -731,7 +733,7 @@ row_ins_foreign_check_on_constraint( /* This is an UPDATE */ - row_ins_foreign_report_err((char*)"Trying to update", + row_ins_foreign_report_err("Trying to update", thr, foreign, btr_pcur_get_rec(pcur), entry); @@ -792,7 +794,7 @@ row_ins_foreign_check_on_constraint( err = DB_ROW_IS_REFERENCED; row_ins_foreign_report_err( -(char*)"Trying an update, possibly causing a cyclic cascaded update\n" +"Trying an update, possibly causing a cyclic cascaded update\n" "in the child table,", thr, foreign, btr_pcur_get_rec(pcur), entry); goto nonstandard_exit_func; @@ -927,7 +929,7 @@ row_ins_foreign_check_on_constraint( err = DB_ROW_IS_REFERENCED; row_ins_foreign_report_err( -(char*)"Trying a cascaded update where the updated value in the child\n" +"Trying a cascaded update where the updated value in the child\n" "table would not fit in the length of the column, or the value would\n" "be NULL and the column is declared as not NULL in the child table,", thr, foreign, btr_pcur_get_rec(pcur), entry); @@ -1245,7 +1247,7 @@ run_again: } } else { row_ins_foreign_report_err( - (char*)"Trying to delete or update", + "Trying to delete or update", thr, foreign, rec, entry); err = DB_ROW_IS_REFERENCED; @@ -1704,6 +1706,7 @@ row_ins_index_entry_low( ulint modify = 0; /* remove warning */ rec_t* insert_rec; rec_t* rec; + rec_t* first_rec; ulint err; ulint n_unique; big_rec_t* big_rec = NULL; @@ -1736,6 +1739,14 @@ row_ins_index_entry_low( goto function_exit; } + first_rec = page_rec_get_next(page_get_infimum_rec( + buf_frame_align(btr_cur_get_rec(&cursor)))); + + if (!page_rec_is_supremum(first_rec)) { + ut_a((rec_get_n_fields(first_rec)) + == dtuple_get_n_fields(entry)); + } + n_unique = 
dict_index_get_n_unique(index); if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 98ab1a1e754..556c80c948d 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -22,12 +22,15 @@ Created 9/17/2000 Heikki Tuuri #include "dict0dict.h" #include "dict0crea.h" #include "dict0load.h" +#include "dict0boot.h" #include "trx0roll.h" #include "trx0purge.h" #include "lock0lock.h" #include "rem0cmp.h" #include "log0log.h" #include "btr0sea.h" +#include "fil0fil.h" +#include "ibuf0ibuf.h" /* A dummy variable used to fool the compiler */ ibool row_mysql_identically_false = FALSE; @@ -104,6 +107,19 @@ row_mysql_read_var_ref_noninline( } /*********************************************************************** +Frees the blob heap in prebuilt when no longer needed. */ + +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct of a + ha_innobase:: table handle */ +{ + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; +} + +/*********************************************************************** Stores a reference to a BLOB in the MySQL format. 
*/ void @@ -304,17 +320,18 @@ handle_new_error: exit(1); } else if (err == DB_CORRUPTION) { - fputs( + fputs( "InnoDB: We detected index corruption in an InnoDB type table.\n" "InnoDB: You have to dump + drop + reimport the table or, in\n" "InnoDB: a case of widespread corruption, dump all InnoDB\n" "InnoDB: tables and recreate the whole InnoDB tablespace.\n" "InnoDB: If the mysqld server crashes after the startup or when\n" "InnoDB: you dump the tables, look at section 6.1 of\n" - "InnoDB: http://www.innodb.com/ibman.php for help.\n", stderr); + "InnoDB: http://www.innodb.com/ibman.html for help.\n", stderr); } else { - fprintf(stderr, "InnoDB: unknown error code %lu\n", err); + fprintf(stderr, "InnoDB: unknown error code %lu\n", + (ulong) err); ut_error; } @@ -424,8 +441,9 @@ row_prebuilt_free( || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" -"InnoDB: table handle. Magic n %lu, magic n2 %lu, table name ", - prebuilt->magic_n, prebuilt->magic_n2); +"InnoDB: table handle. Magic n %lu, magic n2 %lu, table name", + (ulong) prebuilt->magic_n, + (ulong) prebuilt->magic_n2); ut_print_name(stderr, prebuilt->table->name); putc('\n', stderr); @@ -507,7 +525,7 @@ row_update_prebuilt_trx( fprintf(stderr, "InnoDB: Error: trying to use a corrupt\n" "InnoDB: trx handle. Magic n %lu\n", - trx->magic_n); + (ulong) trx->magic_n); mem_analyze_corruption((byte*)trx); @@ -517,8 +535,8 @@ row_update_prebuilt_trx( if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to use a corrupt\n" - "InnoDB: table handle. Magic n %lu, table name ", - prebuilt->magic_n); + "InnoDB: table handle. 
Magic n %lu, table name", + (ulong) prebuilt->magic_n); ut_print_name(stderr, prebuilt->table->name); putc('\n', stderr); @@ -674,7 +692,7 @@ row_lock_table_autoinc_for_mysql( return(DB_SUCCESS); } - trx->op_info = (char *) "setting auto-inc lock"; + trx->op_info = "setting auto-inc lock"; if (node == NULL) { row_get_prebuilt_insert_row(prebuilt); @@ -710,14 +728,14 @@ run_again: goto run_again; } - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } que_thr_stop_for_mysql_no_error(thr, trx); - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -758,7 +776,7 @@ row_lock_table_for_mysql( ut_ad(trx); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); - trx->op_info = (char *) "setting table lock"; + trx->op_info = "setting table lock"; if (prebuilt->sel_graph == NULL) { /* Build a dummy select query graph */ @@ -795,14 +813,14 @@ run_again: goto run_again; } - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } que_thr_stop_for_mysql_no_error(thr, trx); - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -831,8 +849,8 @@ row_insert_for_mysql( if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" - "InnoDB: table handle. Magic n %lu, table name ", - prebuilt->magic_n); + "InnoDB: table handle. 
Magic n %lu, table name", + (ulong) prebuilt->magic_n); ut_print_name(stderr, prebuilt->table->name); putc('\n', stderr); @@ -853,7 +871,7 @@ row_insert_for_mysql( return(DB_ERROR); } - trx->op_info = (char *) "inserting"; + trx->op_info = "inserting"; trx_start_if_not_started(trx); @@ -894,7 +912,7 @@ run_again: goto run_again; } - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } @@ -911,7 +929,7 @@ run_again: } row_update_statistics_if_needed(prebuilt->table); - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -1046,8 +1064,8 @@ row_update_for_mysql( if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" - "InnoDB: table handle. Magic n %lu, table name ", - prebuilt->magic_n); + "InnoDB: table handle. Magic n %lu, table name", + (ulong) prebuilt->magic_n); ut_print_name(stderr, prebuilt->table->name); putc('\n', stderr); @@ -1068,7 +1086,7 @@ row_update_for_mysql( return(DB_ERROR); } - trx->op_info = (char *) "updating or deleting"; + trx->op_info = "updating or deleting"; trx_start_if_not_started(trx); @@ -1115,7 +1133,7 @@ run_again: if (err == DB_RECORD_NOT_FOUND) { trx->error_state = DB_SUCCESS; - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -1126,7 +1144,7 @@ run_again: goto run_again; } - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } @@ -1145,7 +1163,7 @@ run_again: row_update_statistics_if_needed(prebuilt->table); - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -1350,7 +1368,8 @@ row_mysql_lock_data_dictionary( /*===========================*/ trx_t* trx) /* in: transaction */ { - ut_a(trx->dict_operation_lock_mode == 0); + ut_a(trx->dict_operation_lock_mode == 0 + || trx->dict_operation_lock_mode == RW_X_LATCH); /* Serialize data dictionary operations with dictionary mutex: no deadlocks or lock waits can occur then in these operations */ @@ -1385,6 +1404,7 @@ Does a table creation operation 
for MySQL. If the name of the table to be created is equal with one of the predefined magic table names, then this also starts printing the corresponding monitor output by the master thread. */ + int row_create_table_for_mysql( /*=======================*/ @@ -1420,7 +1440,7 @@ row_create_table_for_mysql( return(DB_ERROR); } - trx->op_info = (char *) "creating table"; + trx->op_info = "creating table"; if (row_mysql_is_system_table(table->name)) { @@ -1534,9 +1554,8 @@ row_create_table_for_mysql( ut_print_name(stderr, table->name); fputs(" because tablespace full\n", stderr); row_drop_table_for_mysql(table->name, trx, FALSE); - } else { - ut_a(err == DB_DUPLICATE_KEY); + } else if (err == DB_DUPLICATE_KEY) { ut_print_timestamp(stderr); fputs(" InnoDB: Error: table ", stderr); @@ -1554,13 +1573,16 @@ row_create_table_for_mysql( "InnoDB: You can look for further help from section 15.1 of\n" "InnoDB: http://www.innodb.com/ibman.php\n", stderr); } + + /* We may also get err == DB_ERROR if the .ibd file for the + table already exists */ trx->error_state = DB_SUCCESS; } que_graph_free((que_t*) que_node_get_parent(thr)); - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -1589,7 +1611,7 @@ row_create_index_for_mysql( #endif /* UNIV_SYNC_DEBUG */ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); - trx->op_info = (char *) "creating index"; + trx->op_info = "creating index"; /* Check that the same column does not appear twice in the index. Starting from 4.0.14, InnoDB should be able to cope with that, but @@ -1630,6 +1652,9 @@ row_create_index_for_mysql( trx->dict_operation = TRUE; + /* Note that the space id where we store the index is inherited from + the table in dict_build_index_def_step() in dict0crea.c. 
*/ + node = ind_create_graph_create(index, heap); thr = pars_complete_graph_for_exec(node, trx, heap); @@ -1642,7 +1667,6 @@ row_create_index_for_mysql( que_graph_free((que_t*) que_node_get_parent(thr)); error_handling: - if (err != DB_SUCCESS) { /* We have special error handling here */ @@ -1655,7 +1679,7 @@ error_handling: trx->error_state = DB_SUCCESS; } - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -1672,15 +1696,16 @@ constraints which reference this table are ok. */ int row_table_add_foreign_constraints( /*==============================*/ - /* out: error code or DB_SUCCESS */ - trx_t* trx, /* in: transaction */ - char* sql_string, /* in: table create statement where - foreign keys are declared like: + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: FOREIGN KEY (a, b) REFERENCES table2(c, d), - table2 can be written also with the database - name before it: test.table2 */ - char* name) /* in: table full name in the normalized form - database_name/table_name */ + table2 can be written also with the + database name before it: test.table2 */ + const char* name) /* in: table full name in the + normalized form + database_name/table_name */ { ulint err; @@ -1690,7 +1715,7 @@ row_table_add_foreign_constraints( #endif /* UNIV_SYNC_DEBUG */ ut_a(sql_string); - trx->op_info = (char *) "adding foreign keys"; + trx->op_info = "adding foreign keys"; trx_start_if_not_started(trx); @@ -1734,8 +1759,8 @@ static int row_drop_table_for_mysql_in_background( /*===================================*/ - /* out: error code or DB_SUCCESS */ - char* name) /* in: table name */ + /* out: error code or DB_SUCCESS */ + const char* name) /* in: table name */ { ulint error; trx_t* trx; @@ -1901,6 +1926,275 @@ row_add_table_to_background_drop_list( } /************************************************************************* +Discards the tablespace 
of a table which is stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set TRUE. + +How do we prevent crashes caused by ongoing operations on the table? Old +operations could try to access non-existent pages. + +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do DISCARD TABLESPACE. Then there are no running +queries on the table. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: we remove all entries for the tablespace in the insert +buffer tree; as long as the tablespace mem object does not exist, ongoing +insert buffer page merges are discarded in buf0rea.c. If we recreate the +tablespace mem object with IMPORT TABLESPACE later, then the tablespace will +have the same id, but the tablespace_version field in the mem object is +different, and ongoing old insert buffer page merges get discarded. +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations.
*/ + +int +row_discard_tablespace_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dulint new_id; + dict_table_t* table; + que_thr_t* thr; + que_t* graph = NULL; + ibool success; + ulint err; + char* buf; + + static const char discard_tablespace_proc1[] = + "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "new_id CHAR;\n" + "new_id_low INT;\n" + "new_id_high INT;\n" + "table_name CHAR;\n" + "BEGIN\n" + "table_name := "; + static const char discard_tablespace_proc2[] = + ";\n" + "new_id_high := %lu;\n" + "new_id_low := %lu;\n" + "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n" + "SELECT ID INTO old_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = table_name;\n" + "IF (SQL %% NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "UPDATE SYS_TABLES SET ID = new_id\n" + "WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "discarding tablespace"; + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, name); + fputs("\n" +"InnoDB: is in the system tablespace 0 which cannot be discarded\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + buf = mem_alloc((sizeof discard_tablespace_proc1) + + (sizeof discard_tablespace_proc2) + + 20 + 
ut_strlenq(name, '\'')); + + memcpy(buf, discard_tablespace_proc1, sizeof discard_tablespace_proc1); + sprintf(ut_strcpyq(buf + (sizeof discard_tablespace_proc1 - 1), + '\'', name), + discard_tablespace_proc2, + (ulong) ut_dulint_get_high(new_id), + (ulong) ut_dulint_get_low(new_id)); + + graph = pars_sql(buf); + + ut_a(graph); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + dict_table_change_id_in_cache(table, new_id); + + success = fil_discard_tablespace(table->space); + + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + + err = DB_ERROR; + } else { + /* Set the flag which tells that now it is legal to + IMPORT a tablespace for this table */ + table->tablespace_discarded = TRUE; + table->ibd_file_missing = TRUE; + } + } +funct_exit: + row_mysql_unlock_data_dictionary(trx); + + if (graph) { + que_graph_free(graph); + } + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + + return((int) err); +} + +/********************************************************************* +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. 
*/ + +int +row_import_tablespace_for_mysql( +/*============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + ibool success; + dulint current_lsn; + ulint err = DB_SUCCESS; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx_start_if_not_started(trx); + + trx->op_info = "importing tablespace"; + + current_lsn = log_get_lsn(); + + /* It is possible, though very improbable, that the lsn's in the + tablespace to be imported have risen above the current system lsn, if + a lengthy purge, ibuf merge, or rollback was performed on a backup + taken with ibbackup. If that is the case, reset page lsn's in the + file. We assume that mysqld was shut down after it performed these + cleanup operations on the .ibd file, so that it stamped the latest lsn + to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file. + + TODO: reset also the trx id's in clustered index records and write + a new space id to each data page. That would allow us to import clean + .ibd files from another MySQL installation. 
*/ + + success = fil_reset_too_high_lsns(name, current_lsn); + + if (!success) { + err = DB_ERROR; + + row_mysql_lock_data_dictionary(trx); + + goto funct_exit; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, name); + fputs("\n" +"InnoDB: is in the system tablespace 0 which cannot be imported\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + if (!table->tablespace_discarded) { + ut_print_timestamp(stderr); + fputs( +" InnoDB: Error: you are trying to IMPORT a tablespace\n" +"InnoDB: ", stderr); + ut_print_name(stderr, name); + fputs(", though you have not called DISCARD on it yet\n" +"InnoDB: during the lifetime of the mysqld process!\n", stderr); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + success = fil_open_single_table_tablespace(table->space, table->name); + + if (success) { + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; + } else { + err = DB_ERROR; + } + +funct_exit: + row_mysql_unlock_data_dictionary(trx); + + trx_commit_for_mysql(trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* Drops a table for MySQL. If the name of the table to be dropped is equal with one of the predefined magic table names, then this also stops printing the corresponding monitor output by the master thread. */ @@ -1908,18 +2202,20 @@ the corresponding monitor output by the master thread. 
*/ int row_drop_table_for_mysql( /*=====================*/ - /* out: error code or DB_SUCCESS */ - char* name, /* in: table name */ - trx_t* trx, /* in: transaction handle */ - ibool drop_db)/* in: TRUE=dropping whole database */ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx, /* in: transaction handle */ + ibool drop_db)/* in: TRUE=dropping whole database */ { dict_foreign_t* foreign; dict_table_t* table; + ulint space_id; que_thr_t* thr; que_t* graph; ulint err; const char* table_name; ulint namelen; + ibool success; ibool locked_dictionary = FALSE; char* quoted_name; char* sql; @@ -1998,12 +2294,12 @@ row_drop_table_for_mysql( "InnoDB: database modifications by the user. Shut down\n" "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" "InnoDB: with raw, and innodb_force_... is removed.\n", - stderr); + stderr); return(DB_ERROR); } - trx->op_info = (char *) "dropping table"; + trx->op_info = "dropping table"; trx_start_if_not_started(trx); @@ -2028,7 +2324,6 @@ row_drop_table_for_mysql( } else if (namelen == sizeof S_innodb_lock_monitor && !memcmp(table_name, S_innodb_lock_monitor, sizeof S_innodb_lock_monitor)) { - srv_print_innodb_monitor = FALSE; srv_print_innodb_lock_monitor = FALSE; } else if (namelen == sizeof S_innodb_tablespace_monitor @@ -2043,6 +2338,21 @@ row_drop_table_for_mysql( srv_print_innodb_table_monitor = FALSE; } + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(name != NULL); + + if (srv_created_new_raw) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... 
is removed.\n", + stderr); + + return(DB_ERROR); + } + quoted_name = mem_strdupq(name, '\''); namelen = strlen(quoted_name); sql = mem_alloc((sizeof str1) + (sizeof str2) - 2 + 1 + namelen); @@ -2185,16 +2495,41 @@ row_drop_table_for_mysql( ut_error; } else { + space_id = table->space; dict_table_remove_from_cache(table); if (dict_load_table(name) != NULL) { ut_print_timestamp(stderr); - fputs(" InnoDB: Error: dropping of table ", + fputs(" InnoDB: Error: not able to remove table ", stderr); ut_print_name(stderr, name); - fputs(" failed!\n", stderr); + fputs(" from the dictionary cache!\n", stderr); err = DB_ERROR; } + + /* Do not drop possible .ibd tablespace if something went + wrong: we do not want to delete valuable data of the user */ + + if (err == DB_SUCCESS && space_id > 0) { + if (!fil_space_for_table_exists_in_mem(space_id, name, + FALSE, TRUE)) { + err = DB_ERROR; + + goto funct_exit; + } + + success = fil_delete_tablespace(space_id); + + if (!success) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: not able to delete tablespace %lu of table ", + (ulong) space_id); + ut_print_name(stderr, name); + fputs("!\n", stderr); + err = DB_ERROR; + } + } } funct_exit: @@ -2206,7 +2541,7 @@ funct_exit: trx_commit_for_mysql(trx); - trx->op_info = (char *) ""; + trx->op_info = ""; srv_wake_master_thread(); @@ -2219,9 +2554,9 @@ Drops a database for MySQL. 
*/ int row_drop_database_for_mysql( /*========================*/ - /* out: error code or DB_SUCCESS */ - char* name, /* in: database name which ends to '/' */ - trx_t* trx) /* in: transaction handle */ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: database name which ends to '/' */ + trx_t* trx) /* in: transaction handle */ { dict_table_t* table; char* table_name; @@ -2232,7 +2567,7 @@ row_drop_database_for_mysql( ut_a(name != NULL); ut_a(name[namelen - 1] == '/'); - trx->op_info = (char *) "dropping database"; + trx->op_info = "dropping database"; trx_start_if_not_started(trx); loop: @@ -2286,7 +2621,7 @@ loop: trx_commit_for_mysql(trx); - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } @@ -2311,10 +2646,10 @@ Renames a table for MySQL. */ int row_rename_table_for_mysql( /*=======================*/ - /* out: error code or DB_SUCCESS */ - char* old_name, /* in: old table name */ - char* new_name, /* in: new table name */ - trx_t* trx) /* in: transaction handle */ + /* out: error code or DB_SUCCESS */ + const char* old_name, /* in: old table name */ + const char* new_name, /* in: new table name */ + trx_t* trx) /* in: transaction handle */ { dict_table_t* table; que_thr_t* thr; @@ -2403,6 +2738,7 @@ row_rename_table_for_mysql( ibool recovering_temp_table = FALSE; ulint len; ulint i; + ibool success; /* length of database name; 0 if not renaming to a temporary table */ ulint db_name_len; char* sql; @@ -2436,7 +2772,7 @@ row_rename_table_for_mysql( return(DB_ERROR); } - trx->op_info = (char *) "renaming table"; + trx->op_info = "renaming table"; trx_start_if_not_started(trx); if (row_mysql_is_recovered_tmp_table(new_name)) { @@ -2453,7 +2789,29 @@ row_rename_table_for_mysql( if (!table) { err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, old_name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is trying to rename the 
table.\n" + "InnoDB: Have you copied the .frm file of the table to the\n" + "InnoDB: MySQL database directory from another database?\n" + "InnoDB: You can look for further help from section 15.1 of\n" + "InnoDB: http://www.innodb.com/ibman.php\n", stderr); + goto funct_exit; + } + + if (table->ibd_file_missing) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, old_name); + fputs( + " does not have an .ibd file in the database directory.\n" + "InnoDB: You can look for further help from section 15.1 of\n" + "InnoDB: http://www.innodb.com/ibman.php\n", stderr); goto funct_exit; } @@ -2480,7 +2838,7 @@ row_rename_table_for_mysql( goto funct_exit; } - + /* reserve space for all database names */ len += 2 * n_constraints_to_drop * (ut_strlenq(old_name, '\'') @@ -2580,20 +2938,18 @@ row_rename_table_for_mysql( if (err != DB_SUCCESS) { if (err == DB_DUPLICATE_KEY) { ut_print_timestamp(stderr); - - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, new_name); - fputs(" exists in the InnoDB internal data\n" + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, new_name); + fputs(" exists in the InnoDB internal data\n" "InnoDB: dictionary though MySQL is trying rename table ", stderr); - ut_print_name(stderr, old_name); - fputs(" to it.\n" + ut_print_name(stderr, old_name); + fputs(" to it.\n" "InnoDB: Have you deleted the .frm file and not used DROP TABLE?\n" "InnoDB: You can look for further help from section 15.1 of\n" - "InnoDB: http://www.innodb.com/ibman.php\n" + "InnoDB: http://www.innodb.com/ibman.php\n" "InnoDB: If table ", stderr); - ut_print_name(stderr, new_name); - fputs( - " is a temporary table #sql..., then it can be that\n" + ut_print_name(stderr, new_name); + fputs(" is a temporary table #sql..., then it can be that\n" "InnoDB: there are still queries running on the table, and it will be\n" "InnoDB: dropped automatically when the queries end.\n" "InnoDB: 
You can drop the orphaned table inside InnoDB by\n" @@ -2602,13 +2958,30 @@ row_rename_table_for_mysql( "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n" "InnoDB: succeed.\n", stderr); } - trx->error_state = DB_SUCCESS; trx_general_rollback_for_mysql(trx, FALSE, NULL); trx->error_state = DB_SUCCESS; } else { - ut_a(dict_table_rename_in_cache(table, new_name, - !row_is_mysql_tmp_table_name(new_name))); + /* The following call will also rename the .ibd data file if + the table is stored in a single-table tablespace */ + + success = dict_table_rename_in_cache(table, new_name, + !row_is_mysql_tmp_table_name(new_name)); + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); + fputs(" InnoDB: Error in table rename, cannot rename ", + stderr); + ut_print_name(stderr, old_name); + fputs(" to ", stderr); + ut_print_name(stderr, new_name); + putc('\n', stderr); + err = DB_ERROR; + + goto funct_exit; + } if (row_is_mysql_tmp_table_name(old_name)) { @@ -2622,20 +2995,17 @@ row_rename_table_for_mysql( err = dict_load_foreigns(new_name); if (err != DB_SUCCESS) { - ut_print_timestamp(stderr); - - fputs(" InnoDB: Error: in ALTER TABLE table ", + fputs(" InnoDB: Error: in ALTER TABLE ", stderr); ut_print_name(stderr, new_name); fputs("\n" - "InnoDB: has or is referenced in foreign key constraints\n" - "InnoDB: which are not compatible with the new table definition.\n", + "InnoDB: has or is referenced in foreign key constraints\n" + "InnoDB: which are not compatible with the new table definition.\n", stderr); - + ut_a(dict_table_rename_in_cache(table, old_name, FALSE)); - trx->error_state = DB_SUCCESS; trx_general_rollback_for_mysql(trx, FALSE, NULL); @@ -2643,8 +3013,8 @@ row_rename_table_for_mysql( } } } -funct_exit: - if (!recovering_temp_table) { +funct_exit: + if (!recovering_temp_table) { row_mysql_unlock_data_dictionary(trx); } @@ -2658,7 +3028,7 @@ 
funct_exit: trx_commit_for_mysql(trx); - trx->op_info = (char *) ""; + trx->op_info = ""; return((int) err); } @@ -2795,8 +3165,8 @@ row_check_table_for_mysql( ulint n_rows_in_table = ULINT_UNDEFINED; ulint ret = DB_SUCCESS; ulint old_isolation_level; - - prebuilt->trx->op_info = (char *) "checking table"; + + prebuilt->trx->op_info = "checking table"; old_isolation_level = prebuilt->trx->isolation_level; @@ -2827,21 +3197,21 @@ row_check_table_for_mysql( ret = DB_ERROR; } - /* fprintf(stderr, "%lu entries in index ", n_rows); - ut_print_name(stderr, index->name); - putc('\n', stderr); */ + /* fprintf(stderr, "%lu entries in index %s\n", n_rows, + index->name); */ if (index == dict_table_get_first_index(table)) { n_rows_in_table = n_rows; } else if (n_rows != n_rows_in_table) { ret = DB_ERROR; - - fputs("InnoDB: Error: ", stderr); + + fputs("Error: ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, " contains %lu entries, should be %lu\n", - n_rows, n_rows_in_table); + (ulong) n_rows, + (ulong) n_rows_in_table); } } @@ -2864,7 +3234,7 @@ row_check_table_for_mysql( srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ mutex_exit(&kernel_mutex); - prebuilt->trx->op_info = (char *) ""; + prebuilt->trx->op_info = ""; return(ret); } diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index 2ddc60613fc..8f5f0831dc6 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -519,6 +519,16 @@ row_purge_parse_undo_rec( return(FALSE); } + if (node->table->ibd_file_missing) { + /* We skip purge of missing .ibd files */ + + node->table = NULL; + + row_mysql_unfreeze_data_dictionary(trx); + + return(FALSE); + } + clust_index = dict_table_get_first_index(node->table); if (clust_index == NULL) { diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index e0bf4684214..8a0da2851a7 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -1756,7 +1756,7 @@ row_sel_step( return(NULL); } else { /* SQL error detected */ - 
fprintf(stderr, "SQL error %lu\n", err); + fprintf(stderr, "SQL error %lu\n", (ulong) err); que_thr_handle_error(thr, DB_ERROR, NULL, 0); @@ -1806,7 +1806,7 @@ fetch_step( if (sel_node->state == SEL_NODE_CLOSED) { /* SQL error detected */ - fprintf(stderr, "SQL error %lu\n", (ulint)DB_ERROR); + fprintf(stderr, "SQL error %lu\n", (ulong)DB_ERROR); que_thr_handle_error(thr, DB_ERROR, NULL, 0); @@ -1903,6 +1903,7 @@ row_sel_convert_mysql_key_to_innobase( ulint key_len) /* in: MySQL key value length */ { byte* original_buf = buf; + byte* original_key_ptr = key_ptr; dict_field_t* field; dfield_t* dfield; ulint data_offset; @@ -2027,7 +2028,15 @@ row_sel_convert_mysql_key_to_innobase( ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Warning: using a partial-field key prefix in search\n"); + " InnoDB: Warning: using a partial-field key prefix in search.\n" + "InnoDB: Table name %s, index name %s. Last data field length %lu bytes,\n" + "InnoDB: key ptr now exceeds key end by %lu bytes.\n" + "InnoDB: Key value in the MySQL format:\n", index->table_name, index->name, + (ulong) data_field_len, + (ulong) (key_ptr - key_end)); + fflush(stderr); + ut_print_buf(stderr, original_key_ptr, key_len); + fprintf(stderr, "\n"); if (!is_null) { dfield->len -= (ulint)(key_ptr - key_end); @@ -2064,11 +2073,11 @@ row_sel_store_row_id_to_prebuilt( if (len != DATA_ROW_ID_LEN) { fprintf(stderr, -"InnoDB: Error: Row id field is wrong length %lu in ", len); +"InnoDB: Error: Row id field is wrong length %lu in ", (ulong) len); dict_index_name_print(stderr, index); fprintf(stderr, "\n" "InnoDB: Field number %lu, record:\n", - dict_index_get_sys_col_pos(index, DATA_ROW_ID)); + (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID)); rec_print(stderr, index_rec); putc('\n', stderr); ut_error; @@ -2149,9 +2158,13 @@ Note that the template in prebuilt may advise us to copy only a few columns to mysql_rec, other columns are left blank. All columns may not be needed in the query. 
*/ static -void +ibool row_sel_store_mysql_rec( /*====================*/ + /* out: TRUE if success, FALSE if + could not allocate memory for a BLOB + (though we may also assert in that + case) */ byte* mysql_rec, /* out: row in the MySQL format */ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ rec_t* rec) /* in: Innobase record in the index @@ -2163,6 +2176,7 @@ row_sel_store_mysql_rec( byte* data; ulint len; byte* blob_buf; + int pad_char; ulint i; ut_ad(prebuilt->mysql_template); @@ -2172,9 +2186,10 @@ row_sel_store_mysql_rec( prebuilt->blob_heap = NULL; } - /* Mark all columns as not SQL NULL */ + /* MySQL assumes that all columns have the SQL NULL bit set unless it + is a nullable column with a non-NULL value */ - memset(mysql_rec, '\0', prebuilt->null_bitmap_len); + memset(mysql_rec, 0xFF, prebuilt->null_bitmap_len); for (i = 0; i < prebuilt->n_template; i++) { @@ -2191,6 +2206,10 @@ row_sel_store_mysql_rec( extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE); + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + data = btr_rec_copy_externally_stored_field(rec, templ->rec_field_no, &len, extern_field_heap); @@ -2202,9 +2221,29 @@ row_sel_store_mysql_rec( if (templ->type == DATA_BLOB) { ut_a(prebuilt->templ_contains_blob); - - /* Copy the BLOB data to the BLOB - heap of prebuilt */ + + /* A heuristic test that we can allocate the + memory for a big BLOB. We have a safety margin + of 1000000 bytes. Since the test takes some + CPU time, we do not use it for small BLOBs. */ + + if (len > 2000000 + && !ut_test_malloc(len + 1000000)) { + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: could not allocate %lu + 1000000 bytes to retrieve\n" +"InnoDB: a big column. 
Table name %s\n", (ulong) len, prebuilt->table->name); + + if (extern_field_heap) { + mem_heap_free( + extern_field_heap); + } + return(FALSE); + } + + /* Copy the BLOB data to the BLOB heap of + prebuilt */ if (prebuilt->blob_heap == NULL) { prebuilt->blob_heap = @@ -2222,33 +2261,46 @@ row_sel_store_mysql_rec( mysql_rec + templ->mysql_col_offset, templ->mysql_col_len, data, len, templ->type, templ->is_unsigned); - + + /* Cleanup */ if (extern_field_heap) { mem_heap_free(extern_field_heap); extern_field_heap = NULL; } + + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ + mysql_rec[templ->mysql_null_byte_offset] &= + ~(byte) (templ->mysql_null_bit_mask); + } } else { /* MySQL seems to assume the field for an SQL NULL - value is set to zero. Not taking this into account - caused seg faults with NULL BLOB fields, and + value is set to zero or space. Not taking this into + account caused seg faults with NULL BLOB fields, and bug number 154 in the MySQL bug database: GROUP BY and DISTINCT could treat NULL values inequal. */ - memset(mysql_rec + templ->mysql_col_offset, '\0', - templ->mysql_col_len); - - if (!templ->mysql_null_bit_mask) { - fputs( -"InnoDB: Error: trying to return an SQL NULL field in a non-null\n" -"innoDB: column! 
Table name ", stderr); - ut_print_name(stderr, prebuilt->table->name); - putc('\n', stderr); + if (templ->type == DATA_VARCHAR + || templ->type == DATA_CHAR + || templ->type == DATA_BINARY + || templ->type == DATA_FIXBINARY + || templ->type == DATA_MYSQL + || templ->type == DATA_VARMYSQL) { + /* MySQL pads all non-BLOB and non-TEXT + string types with space ' ' */ + + pad_char = ' '; } else { - mysql_rec[templ->mysql_null_byte_offset] |= - (byte) (templ->mysql_null_bit_mask); + pad_char = '\0'; } + + memset(mysql_rec + templ->mysql_col_offset, pad_char, + templ->mysql_col_len); } } + + return(TRUE); } /************************************************************************* @@ -2567,9 +2619,9 @@ row_sel_push_cache_row_for_mysql( ut_ad(prebuilt->fetch_cache_first == 0); - row_sel_store_mysql_rec( + ut_a(row_sel_store_mysql_rec( prebuilt->fetch_cache[prebuilt->n_fetch_cached], - prebuilt, rec); + prebuilt, rec)); prebuilt->n_fetch_cached++; } @@ -2651,7 +2703,8 @@ row_search_for_mysql( /*=================*/ /* out: DB_SUCCESS, DB_RECORD_NOT_FOUND, - DB_END_OF_INDEX, or DB_DEADLOCK */ + DB_END_OF_INDEX, DB_DEADLOCK, + or DB_TOO_BIG_RECORD */ byte* buf, /* in/out: buffer for the fetched row in the MySQL format */ ulint mode, /* in: search mode PAGE_CUR_L, ... */ @@ -2707,7 +2760,7 @@ row_search_for_mysql( fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" "InnoDB: table handle. 
Magic n %lu, table name ", - prebuilt->magic_n); + (ulong) prebuilt->magic_n); ut_print_name(stderr, prebuilt->table->name); putc('\n', stderr); @@ -2716,7 +2769,17 @@ row_search_for_mysql( ut_error; } -/* fprintf(stderr, "Match mode %lu\n search tuple ", match_mode); + if (trx->n_mysql_tables_in_use == 0) { + fputs( +"InnoDB: Error: MySQL is trying to perform a SELECT\n" +"InnoDB: but it has not locked any tables in ::external_lock()!\n", + stderr); + trx_print(stderr, trx); + fputc('\n', stderr); + ut_a(0); + } + +/* fprintf(stderr, "Match mode %lu\n search tuple ", (ulong) match_mode); dtuple_print(search_tuple); fprintf(stderr, "N tables locked %lu\n", trx->mysql_n_tables_locked); @@ -2743,7 +2806,7 @@ row_search_for_mysql( /* PHASE 1: Try to pop the row from the prefetch cache */ if (direction == 0) { - trx->op_info = (char *) "starting index read"; + trx->op_info = "starting index read"; prebuilt->n_rows_fetched = 0; prebuilt->n_fetch_cached = 0; @@ -2754,7 +2817,7 @@ row_search_for_mysql( row_prebuild_sel_graph(prebuilt); } } else { - trx->op_info = (char *) "fetching rows"; + trx->op_info = "fetching rows"; if (prebuilt->n_rows_fetched == 0) { prebuilt->fetch_direction = direction; @@ -2779,7 +2842,7 @@ row_search_for_mysql( prebuilt->n_rows_fetched++; srv_n_rows_read++; - trx->op_info = (char *) ""; + trx->op_info = ""; return(DB_SUCCESS); } @@ -2791,7 +2854,7 @@ row_search_for_mysql( cache, but the cache was not full at the time of the popping: no more rows can exist in the result set */ - trx->op_info = (char *) ""; + trx->op_info = ""; return(DB_RECORD_NOT_FOUND); } @@ -2833,10 +2896,10 @@ row_search_for_mysql( retrieve also a second row if a primary key contains more than 1 column. Return immediately if this is not a HANDLER command. 
*/ - + if (direction != 0 && !prebuilt->used_in_HANDLER) { - - trx->op_info = (char*)""; + + trx->op_info = ""; return(DB_RECORD_NOT_FOUND); } } @@ -2892,7 +2955,14 @@ row_search_for_mysql( #ifdef UNIV_SEARCH_DEBUG ut_a(0 == cmp_dtuple_rec(search_tuple, rec)); #endif - row_sel_store_mysql_rec(buf, prebuilt, rec); + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec)) { + err = DB_TOO_BIG_RECORD; + + /* We let the main loop to do the + error handling */ + goto shortcut_fails_too_big_rec; + } mtr_commit(&mtr); @@ -2910,7 +2980,7 @@ row_search_for_mysql( trx->has_search_latch = FALSE; } - trx->op_info = (char *) ""; + trx->op_info = ""; /* NOTE that we do NOT store the cursor position */ @@ -2933,14 +3003,14 @@ row_search_for_mysql( trx->has_search_latch = FALSE; } - trx->op_info = (char *) ""; + trx->op_info = ""; /* NOTE that we do NOT store the cursor position */ return(DB_RECORD_NOT_FOUND); } - +shortcut_fails_too_big_rec: mtr_commit(&mtr); mtr_start(&mtr); } @@ -3016,6 +3086,16 @@ row_search_for_mysql( if (!prebuilt->sql_stat_start) { /* No need to set an intention lock or assign a read view */ + if (trx->read_view == NULL + && prebuilt->select_lock_type == LOCK_NONE) { + + fputs( +"InnoDB: Error: MySQL is trying to perform a consistent read\n" +"InnoDB: but the read view is not assigned!\n", stderr); + trx_print(stderr, trx); + fputc('\n', stderr); + ut_a(0); + } } else if (prebuilt->select_lock_type == LOCK_NONE) { /* This is a consistent read */ /* Assign a read view for the query */ @@ -3091,12 +3171,13 @@ rec_loop: fprintf(stderr, " InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", - (ulint)(rec - buf_frame_align(rec)), next_offs, - buf_frame_get_page_no(rec)); + (ulong) (rec - buf_frame_align(rec)), + (ulong) next_offs, + (ulong) buf_frame_get_page_no(rec)); dict_index_name_print(stderr, index); fputs(". Run CHECK TABLE. 
You may need to\n" "InnoDB: restore from a backup, or dump + drop + reimport the table.\n", - stderr); + stderr); err = DB_CORRUPTION; @@ -3108,8 +3189,9 @@ rec_loop: fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", - (ulint)(rec - buf_frame_align(rec)), next_offs, - buf_frame_get_page_no(rec)); + (ulong) (rec - buf_frame_align(rec)), + (ulong) next_offs, + (ulong) buf_frame_get_page_no(rec)); dict_index_name_print(stderr, index); fputs(". We try to skip the rest of the page.\n", stderr); @@ -3126,8 +3208,9 @@ rec_loop: fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", - (ulint)(rec - buf_frame_align(rec)), next_offs, - buf_frame_get_page_no(rec)); + (ulong) (rec - buf_frame_align(rec)), + (ulong) next_offs, + (ulong) buf_frame_get_page_no(rec)); dict_index_name_print(stderr, index); fputs(". We try to skip the record.\n", stderr); @@ -3358,7 +3441,11 @@ rec_loop: rec_get_size(rec)); mach_write_to_4(buf, rec_get_extra_size(rec) + 4); } else { - row_sel_store_mysql_rec(buf, prebuilt, rec); + if (!row_sel_store_mysql_rec(buf, prebuilt, rec)) { + err = DB_TOO_BIG_RECORD; + + goto lock_wait_or_error; + } } if (prebuilt->clust_index_was_generated) { @@ -3463,7 +3550,7 @@ lock_wait_or_error: /* fputs("Using ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); @@ -3486,7 +3573,7 @@ normal_return: srv_n_rows_read++; } - trx->op_info = (char *) ""; + trx->op_info = ""; return(ret); } @@ -3498,11 +3585,11 @@ consistent read result, or store it to the query cache. 
*/ ibool row_search_check_if_query_cache_permitted( /*======================================*/ - /* out: TRUE if storing or retrieving from - the query cache is permitted */ - trx_t* trx, /* in: transaction object */ - char* norm_name) /* in: concatenation of database name, '/' - char, table name */ + /* out: TRUE if storing or retrieving + from the query cache is permitted */ + trx_t* trx, /* in: transaction object */ + const char* norm_name) /* in: concatenation of database name, + '/' char, table name */ { dict_table_t* table; ibool ret = FALSE; diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c index df2cdb6359d..9dc860d70b1 100644 --- a/innobase/row/row0uins.c +++ b/innobase/row/row0uins.c @@ -241,6 +241,13 @@ row_undo_ins_parse_undo_rec( return; } + if (node->table->ibd_file_missing) { + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + clust_index = dict_table_get_first_index(node->table); ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index 1f74cfb52be..d47227166f3 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -688,6 +688,13 @@ row_undo_mod_parse_undo_rec( return; } + if (node->table->ibd_file_missing) { + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + clust_index = dict_table_get_first_index(node->table); ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c index e1e44724752..bc3cc8ea9f3 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -320,7 +320,8 @@ row_undo_step( if (err != DB_SUCCESS) { /* SQL error detected */ - fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", err); + fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", + (ulong) err); if (err == DB_OUT_OF_FILE_SPACE) { fprintf(stderr, diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c 
index 22f7ba60ca4..d35ae0a3e38 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -1602,7 +1602,8 @@ row_upd_clust_step( then we have to free the file segments of the index tree associated with the index */ - if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + if (node->is_delete + && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index 174214f9efe..4bc5b0dc795 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -61,7 +61,7 @@ ulint srv_fatal_semaphore_wait_threshold = 600; ibool srv_lock_timeout_and_monitor_active = FALSE; ibool srv_error_monitor_active = FALSE; -char* srv_main_thread_op_info = (char*) ""; +const char* srv_main_thread_op_info = ""; /* Server parameters which are read from the initfile */ @@ -69,8 +69,14 @@ char* srv_main_thread_op_info = (char*) ""; names, where the file name itself may also contain a path */ char* srv_data_home = NULL; +#ifdef UNIV_LOG_ARCHIVE char* srv_arch_dir = NULL; +#endif /* UNIV_LOG_ARCHIVE */ +ibool srv_file_per_table = FALSE; /* store to its own file each table + created by an user; data dictionary + tables are in the system tablespace + 0 */ ulint srv_n_data_files = 0; char** srv_data_file_names = NULL; ulint* srv_data_file_sizes = NULL; /* size in database pages */ @@ -94,7 +100,6 @@ char** srv_log_group_home_dirs = NULL; ulint srv_n_log_groups = ULINT_MAX; ulint srv_n_log_files = ULINT_MAX; ulint srv_log_file_size = ULINT_MAX; /* size in database pages */ -ibool srv_log_archive_on = TRUE; ulint srv_log_buffer_size = ULINT_MAX; /* size in database pages */ ulint srv_flush_log_at_trx_commit = 1; @@ -136,16 +141,24 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1 , 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF }; -ulint srv_pool_size = ULINT_MAX; /* size in database pages; - MySQL originally sets this - value in bytes */ +ulint 
srv_pool_size = ULINT_MAX; /* size in pages; MySQL inits + this to size in kilobytes but + we normalize this to pages in + srv_boot() */ +ulint srv_awe_window_size = 0; /* size in pages; MySQL inits + this to bytes, but we + normalize it to pages in + srv_boot() */ ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */ ulint srv_lock_table_size = ULINT_MAX; ulint srv_n_file_io_threads = ULINT_MAX; +#ifdef UNIV_LOG_ARCHIVE +ibool srv_log_archive_on = FALSE; ibool srv_archive_recovery = 0; dulint srv_archive_recovery_limit_lsn; +#endif /* UNIV_LOG_ARCHIVE */ ulint srv_lock_wait_timeout = 1024 * 1024 * 1024; @@ -153,6 +166,8 @@ char* srv_file_flush_method_str = NULL; ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +ulint srv_max_n_open_files = 300; + /* The InnoDB main thread tries to keep the ratio of modified pages in the buffer pool to all database pages in the buffer pool smaller than the following number. But it is not guaranteed that the value stays below @@ -210,7 +225,7 @@ struct srv_conc_slot_struct{ UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue of threads waiting to get in */ -srv_conc_slot_t* srv_conc_slots; /* array of wait +srv_conc_slot_t* srv_conc_slots; /* array of wait slots */ /* Number of times a thread is allowed to enter InnoDB within the same @@ -227,6 +242,13 @@ ibool srv_use_doublewrite_buf = TRUE; ibool srv_set_thread_priorities = TRUE; int srv_query_thread_priority = 0; + +/* TRUE if the Address Windowing Extensions of Windows are used; then we must +disable adaptive hash indexes */ +ibool srv_use_awe = FALSE; +ibool srv_use_adaptive_hash_indexes = TRUE; + + /*-------------------------------------------*/ ulint srv_n_spin_wait_rounds = 20; ulint srv_spin_wait_delay = 5; @@ -633,8 +655,8 @@ srv_suspend_thread(void) if (srv_print_thread_releases) { fprintf(stderr, "Suspending thread %lu to slot %lu meter %lu\n", - os_thread_get_curr_id(), slot_no, - 
srv_meter[SRV_RECOVERY]); + (ulong) os_thread_get_curr_id(), (ulong) slot_no, + (ulong) srv_meter[SRV_RECOVERY]); } slot = srv_table_get_nth_slot(slot_no); @@ -696,7 +718,8 @@ srv_release_threads( if (srv_print_thread_releases) { fprintf(stderr, "Releasing thread %lu type %lu from slot %lu meter %lu\n", - slot->id, type, i, srv_meter[SRV_RECOVERY]); + (ulong) slot->id, (ulong) type, (ulong) i, + (ulong) srv_meter[SRV_RECOVERY]); } count++; @@ -803,7 +826,7 @@ srv_init(void) UT_LIST_INIT(srv_conc_queue); srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t)); - + for (i = 0; i < OS_THREAD_MAX_N; i++) { conc_slot = srv_conc_slots + i; conc_slot->reserved = FALSE; @@ -906,11 +929,11 @@ retry: os_fast_mutex_unlock(&srv_conc_mutex); - trx->op_info = (char*)"sleeping before joining InnoDB queue"; + trx->op_info = "sleeping before joining InnoDB queue"; os_thread_sleep(50000); - trx->op_info = (char*)""; + trx->op_info = ""; os_fast_mutex_lock(&srv_conc_mutex); @@ -925,6 +948,7 @@ retry: slot = srv_conc_slots + i; if (!slot->reserved) { + break; } } @@ -962,11 +986,11 @@ retry: /* Go to wait for the event; when a thread leaves InnoDB it will release this thread */ - trx->op_info = (char*)"waiting in InnoDB queue"; + trx->op_info = "waiting in InnoDB queue"; os_event_wait(slot->event); - trx->op_info = (char*)""; + trx->op_info = ""; os_fast_mutex_lock(&srv_conc_mutex); @@ -1119,9 +1143,19 @@ srv_normalize_init_values(void) srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE; - srv_pool_size = srv_pool_size / UNIV_PAGE_SIZE; + srv_pool_size = srv_pool_size / (UNIV_PAGE_SIZE / 1024); + + srv_awe_window_size = srv_awe_window_size / UNIV_PAGE_SIZE; - srv_lock_table_size = 20 * srv_pool_size; + if (srv_use_awe) { + /* If we are using AWE we must save memory in the 32-bit + address space of the process, and cannot bind the lock + table size to the real buffer pool size. 
*/ + + srv_lock_table_size = 20 * srv_awe_window_size; + } else { + srv_lock_table_size = 5 * srv_pool_size; + } return(DB_SUCCESS); } @@ -1187,7 +1221,7 @@ srv_table_reserve_slot_for_mysql(void) " InnoDB: There appear to be %lu MySQL threads currently waiting\n" "InnoDB: inside InnoDB, which is the upper limit. Cannot continue operation.\n" "InnoDB: We intentionally generate a seg fault to print a stack trace\n" -"InnoDB: on Linux. But first we print a list of waiting threads.\n", i); +"InnoDB: on Linux. But first we print a list of waiting threads.\n", (ulong) i); for (i = 0; i < OS_THREAD_MAX_N; i++) { @@ -1195,10 +1229,10 @@ srv_table_reserve_slot_for_mysql(void) fprintf(stderr, "Slot %lu: thread id %lu, type %lu, in use %lu, susp %lu, time %lu\n", - i, os_thread_pf(slot->id), - slot->type, slot->in_use, - slot->suspended, - (ulint)difftime(ut_time(), slot->suspend_time)); + (ulong) i, (ulong) os_thread_pf(slot->id), + (ulong) slot->type, (ulong) slot->in_use, + (ulong) slot->suspended, + (ulong) difftime(ut_time(), slot->suspend_time)); } ut_error; @@ -1415,7 +1449,7 @@ srv_printf_innodb_monitor( { double time_elapsed; time_t current_time; - ulint n_reserved; + ulint n_reserved; mutex_enter(&srv_innodb_monitor_mutex); @@ -1492,36 +1526,45 @@ srv_printf_innodb_monitor( "BUFFER POOL AND MEMORY\n" "----------------------\n", file); fprintf(file, - "Total memory allocated %lu; in additional pool allocated %lu\n", + "Total memory allocated " ULINTPF + "; in additional pool allocated " ULINTPF "\n", ut_total_allocated_memory, mem_pool_get_reserved(mem_comm_pool)); + + if (srv_use_awe) { + fprintf(file, + "In addition to that %lu MB of AWE memory allocated\n", + (ulong) (srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE))); + } + buf_print_io(file); fputs("--------------\n" "ROW OPERATIONS\n" "--------------\n", file); fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n", - srv_conc_n_threads, srv_conc_n_waiting_threads); - - n_reserved = 
fil_space_get_n_reserved_extents(0); - if (n_reserved > 0) { - fprintf(file, - "%lu tablespace extents now reserved for B-tree split operations\n", - n_reserved); - } + (long) srv_conc_n_threads, + (ulong) srv_conc_n_waiting_threads); + n_reserved = fil_space_get_n_reserved_extents(0); + if (n_reserved > 0) { + fprintf(file, + "%lu tablespace extents now reserved for B-tree split operations\n", + (ulong) n_reserved); + } #ifdef UNIV_LINUX fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n", - srv_main_thread_process_no, - srv_main_thread_id, - srv_main_thread_op_info); + (ulong) srv_main_thread_process_no, + (ulong) srv_main_thread_id, + srv_main_thread_op_info); #else fprintf(file, "Main thread id %lu, state: %s\n", - srv_main_thread_id, + (ulong) srv_main_thread_id, srv_main_thread_op_info); #endif fprintf(file, - "Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n", + "Number of rows inserted " ULINTPF + ", updated " ULINTPF ", deleted " ULINTPF ", read " ULINTPF "\n", srv_n_rows_inserted, srv_n_rows_updated, srv_n_rows_deleted, @@ -1752,6 +1795,10 @@ srv_error_monitor_thread( os_thread_create */ { ulint cnt = 0; + dulint old_lsn; + dulint new_lsn; + + old_lsn = srv_start_lsn; #ifdef UNIV_DEBUG_THREAD_CREATION fprintf(stderr, "Error monitor thread starts, id %lu\n", @@ -1762,7 +1809,24 @@ loop: cnt++; - os_thread_sleep(2000000); + /* Try to track a strange bug reported by Harald Fuchs and others, + where the lsn seems to decrease at times */ + + new_lsn = log_get_lsn(); + + if (ut_dulint_cmp(new_lsn, old_lsn) < 0) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: old log sequence number %lu %lu was greater\n" +"InnoDB: than the new log sequence number %lu %lu!\n" +"InnoDB: Please send a bug report to mysql@lists.mysql.com\n", + (ulong) ut_dulint_get_high(old_lsn), + (ulong) ut_dulint_get_low(old_lsn), + (ulong) ut_dulint_get_high(new_lsn), + (ulong) ut_dulint_get_low(new_lsn)); + } + + old_lsn = new_lsn; if 
(difftime(time(NULL), srv_last_monitor_time) > 60) { /* We referesh InnoDB Monitor values so that averages are @@ -1778,6 +1842,8 @@ loop: fflush(stderr); + os_thread_sleep(2000000); + if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) { goto loop; @@ -1888,7 +1954,7 @@ loop: /* ---- When there is database activity by users, we cycle in this loop */ - srv_main_thread_op_info = (char*) "reserving kernel mutex"; + srv_main_thread_op_info = "reserving kernel mutex"; n_ios_very_old = log_sys->n_log_ios + buf_pool->n_pages_read + buf_pool->n_pages_written; @@ -1912,7 +1978,7 @@ loop: for (i = 0; i < 10; i++) { n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read + buf_pool->n_pages_written; - srv_main_thread_op_info = (char*)"sleeping"; + srv_main_thread_op_info = "sleeping"; if (!skip_sleep) { @@ -1925,12 +1991,11 @@ loop: can drop tables lazily after there no longer are SELECT queries to them. */ - srv_main_thread_op_info = - (char*)"doing background drop tables"; + srv_main_thread_op_info = "doing background drop tables"; row_drop_tables_for_mysql_in_background(); - srv_main_thread_op_info = (char*)""; + srv_main_thread_op_info = ""; if (srv_fast_shutdown && srv_shutdown_state > 0) { @@ -1941,9 +2006,12 @@ loop: is issued or the we have specified in my.cnf no flush at transaction commit */ - srv_main_thread_op_info = (char*)"flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); + srv_main_thread_op_info = "making checkpoint"; + log_free_check(); + /* If there were less than 5 i/os during the one second sleep, we assume that there is free disk i/o capacity available, and it makes sense to @@ -1954,11 +2022,10 @@ loop: n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + buf_pool->n_pages_written; if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) { - srv_main_thread_op_info = - (char*)"doing insert buffer merge"; + srv_main_thread_op_info = "doing insert buffer merge"; ibuf_contract_for_n_pages(TRUE, 5); - srv_main_thread_op_info = 
(char*)"flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); } @@ -2006,20 +2073,20 @@ loop: + buf_pool->n_pages_written; if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) { - srv_main_thread_op_info = (char*) "flushing buffer pool pages"; + srv_main_thread_op_info = "flushing buffer pool pages"; buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); - srv_main_thread_op_info = (char*) "flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); } /* We run a batch of insert buffer merge every 10 seconds, even if the server were active */ - srv_main_thread_op_info = (char*)"doing insert buffer merge"; + srv_main_thread_op_info = "doing insert buffer merge"; ibuf_contract_for_n_pages(TRUE, 5); - srv_main_thread_op_info = (char*)"flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); /* We run a full purge every 10 seconds, even if the server @@ -2036,20 +2103,20 @@ loop: goto background_loop; } - srv_main_thread_op_info = (char*)"purging"; + srv_main_thread_op_info = "purging"; n_pages_purged = trx_purge(); current_time = time(NULL); if (difftime(current_time, last_flush_time) > 1) { - srv_main_thread_op_info = (char*) "flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); last_flush_time = current_time; } } - srv_main_thread_op_info = (char*)"flushing buffer pool pages"; + srv_main_thread_op_info = "flushing buffer pool pages"; /* Flush a few oldest pages to make a new checkpoint younger */ @@ -2070,13 +2137,13 @@ loop: ut_dulint_max); } - srv_main_thread_op_info = (char*)"making checkpoint"; + srv_main_thread_op_info = "making checkpoint"; /* Make a new checkpoint about once in 10 seconds */ log_checkpoint(TRUE, FALSE); - srv_main_thread_op_info = (char*)"reserving kernel mutex"; + srv_main_thread_op_info = "reserving kernel mutex"; mutex_enter(&kernel_mutex); @@ -2100,7 +2167,7 @@ background_loop: /* The server has been quiet for a 
while: start running background operations */ - srv_main_thread_op_info = (char*)"doing background drop tables"; + srv_main_thread_op_info = "doing background drop tables"; n_tables_to_drop = row_drop_tables_for_mysql_in_background(); @@ -2113,7 +2180,7 @@ background_loop: os_thread_sleep(100000); } - srv_main_thread_op_info = (char*)"purging"; + srv_main_thread_op_info = "purging"; /* Run a full purge */ @@ -2127,20 +2194,20 @@ background_loop: break; } - srv_main_thread_op_info = (char*)"purging"; + srv_main_thread_op_info = "purging"; n_pages_purged = trx_purge(); current_time = time(NULL); if (difftime(current_time, last_flush_time) > 1) { - srv_main_thread_op_info = (char*) "flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); last_flush_time = current_time; } } - srv_main_thread_op_info = (char*)"reserving kernel mutex"; + srv_main_thread_op_info = "reserving kernel mutex"; mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { @@ -2149,7 +2216,7 @@ background_loop: } mutex_exit(&kernel_mutex); - srv_main_thread_op_info = (char*)"doing insert buffer merge"; + srv_main_thread_op_info = "doing insert buffer merge"; if (srv_fast_shutdown && srv_shutdown_state > 0) { n_bytes_merged = 0; @@ -2157,7 +2224,7 @@ background_loop: n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20); } - srv_main_thread_op_info = (char*)"reserving kernel mutex"; + srv_main_thread_op_info = "reserving kernel mutex"; mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { @@ -2167,10 +2234,10 @@ background_loop: mutex_exit(&kernel_mutex); flush_loop: - srv_main_thread_op_info = (char*)"flushing buffer pool pages"; + srv_main_thread_op_info = "flushing buffer pool pages"; n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); - srv_main_thread_op_info = (char*)"reserving kernel mutex"; + srv_main_thread_op_info = "reserving kernel mutex"; mutex_enter(&kernel_mutex); if (srv_activity_count != 
old_activity_count) { @@ -2179,15 +2246,14 @@ flush_loop: } mutex_exit(&kernel_mutex); - srv_main_thread_op_info = - (char*) "waiting for buffer pool flush to end"; + srv_main_thread_op_info = "waiting for buffer pool flush to end"; buf_flush_wait_batch_end(BUF_FLUSH_LIST); - srv_main_thread_op_info = (char*) "flushing log"; + srv_main_thread_op_info = "flushing log"; log_buffer_flush_to_disk(); - srv_main_thread_op_info = (char*)"making checkpoint"; + srv_main_thread_op_info = "making checkpoint"; log_checkpoint(TRUE, FALSE); @@ -2199,7 +2265,7 @@ flush_loop: goto flush_loop; } - srv_main_thread_op_info = (char*)"reserving kernel mutex"; + srv_main_thread_op_info = "reserving kernel mutex"; mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { @@ -2207,11 +2273,12 @@ flush_loop: goto loop; } mutex_exit(&kernel_mutex); - - srv_main_thread_op_info = - (char*)"archiving log (if log archive is on)"; +/* + srv_main_thread_op_info = "archiving log (if log archive is on)"; log_archive_do(FALSE, &n_bytes_archived); +*/ + n_bytes_archived = 0; /* Keep looping in the background loop if still work to do */ @@ -2238,7 +2305,7 @@ flush_loop: master thread to wait for more server activity */ suspend_thread: - srv_main_thread_op_info = (char*)"suspending"; + srv_main_thread_op_info = "suspending"; mutex_enter(&kernel_mutex); @@ -2252,7 +2319,7 @@ suspend_thread: mutex_exit(&kernel_mutex); - srv_main_thread_op_info = (char*)"waiting for server activity"; + srv_main_thread_op_info = "waiting for server activity"; os_event_wait(event); @@ -2276,7 +2343,7 @@ suspend_thread: os_thread_exit(NULL); #ifndef __WIN__ - return(NULL); + return(NULL); /* Not reached */ #else return(0); #endif diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index 3223854652f..74dd23e4252 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -30,6 +30,7 @@ Created 2/16/1996 Heikki Tuuri #include "page0cur.h" #include "trx0trx.h" #include "dict0boot.h" 
+#include "dict0load.h" #include "trx0sys.h" #include "dict0crea.h" #include "btr0btr.h" @@ -55,6 +56,13 @@ Created 2/16/1996 Heikki Tuuri #include "srv0start.h" #include "que0que.h" +/* Log sequence number immediately after startup */ +dulint srv_start_lsn; +/* Log sequence number at shutdown */ +dulint srv_shutdown_lsn; + +ibool srv_start_raw_disk_in_use = FALSE; + static ibool srv_start_has_been_called = FALSE; ulint srv_sizeof_trx_t_in_ha_innodb_cc; @@ -87,13 +95,6 @@ static char* srv_monitor_file_name; #define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD #define SRV_MAX_N_PENDING_SYNC_IOS 100 -/* The following limit may be too big in some old operating systems: -we may get an assertion failure in os0file.c */ - -#define SRV_MAX_N_OPEN_FILES 500 - -#define SRV_LOG_SPACE_FIRST_ID 1000000000 - /************************************************************************* Reads the data files and their sizes from a character string given in the .cnf file. */ @@ -137,7 +138,8 @@ srv_parse_data_file_paths_and_sizes( while ((*str != ':' && *str != '\0') || (*str == ':' - && (*(str + 1) == '\\' || *(str + 1) == '/'))) { + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { str++; } @@ -227,11 +229,15 @@ srv_parse_data_file_paths_and_sizes( while (*str != '\0') { path = str; - /* Note that we must ignore the ':' in a Windows path */ + /* Note that we must step over the ':' in a Windows path; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ while ((*str != ':' && *str != '\0') || (*str == ':' - && (*(str + 1) == '\\' || *(str + 1) == '/'))) { + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { str++; } @@ -420,7 +426,7 @@ io_handler_thread( os_thread_exit(NULL); #ifndef __WIN__ - return(NULL); + return(NULL); /* Not reached */ #else return(0); #endif @@ -438,7 +444,8 @@ Normalizes a directory path 
for Windows: converts slashes to backslashes. */ void srv_normalize_path_for_win( /*=======================*/ - char* str __attribute__((unused))) /* in/out: null-terminated character string */ + char* str __attribute__((unused))) /* in/out: null-terminated + character string */ { #ifdef __WIN__ for (; *str; str++) { @@ -489,7 +496,7 @@ srv_calc_low32( expressed in bytes */ ulint file_size) /* in: file size in database pages */ { - return(0xFFFFFFFF & (file_size << UNIV_PAGE_SIZE_SHIFT)); + return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT)); } /************************************************************************* @@ -513,6 +520,8 @@ ulint open_or_create_log_file( /*====================*/ /* out: DB_SUCCESS or error code */ + ibool create_new_db, /* in: TRUE if we should create a + new database */ ibool* log_file_created, /* out: TRUE if new log file created */ ibool log_file_has_been_opened,/* in: TRUE if a log file has been @@ -522,11 +531,12 @@ open_or_create_log_file( ulint i) /* in: log file number in group */ { ibool ret; - ulint arch_space_id; ulint size; ulint size_high; char name[10000]; + UT_NOT_USED(create_new_db); + *log_file_created = FALSE; srv_normalize_path_for_win(srv_log_group_home_dirs[k]); @@ -535,12 +545,12 @@ open_or_create_log_file( ut_a(strlen(srv_log_group_home_dirs[k]) < (sizeof name) - 10 - sizeof "ib_logfile"); - sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], "ib_logfile", i); + sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], "ib_logfile", (ulong) i); files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, OS_LOG_FILE, &ret); if (ret == FALSE) { - if (os_file_get_last_error() != OS_FILE_ALREADY_EXISTS) { + if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS) { fprintf(stderr, "InnoDB: Error in creating or opening %s\n", name); @@ -565,9 +575,9 @@ open_or_create_log_file( fprintf(stderr, "InnoDB: Error: log file %s is of different size %lu %lu bytes\n" "InnoDB: than specified in the .cnf file %lu %lu 
bytes!\n", - name, size_high, size, - srv_calc_high32(srv_log_file_size), - srv_calc_low32(srv_log_file_size)); + name, (ulong) size_high, (ulong) size, + (ulong) srv_calc_high32(srv_log_file_size), + (ulong) srv_calc_low32(srv_log_file_size)); return(DB_ERROR); } @@ -585,7 +595,7 @@ open_or_create_log_file( } fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n", - name, srv_log_file_size + name, (ulong) srv_log_file_size >> (20 - UNIV_PAGE_SIZE_SHIFT)); fprintf(stderr, @@ -617,24 +627,26 @@ open_or_create_log_file( ut_a(fil_validate()); fil_node_create(name, srv_log_file_size, - 2 * k + SRV_LOG_SPACE_FIRST_ID); - + 2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE); +#ifdef UNIV_LOG_ARCHIVE /* If this is the first log group, create the file space object - for archived logs */ + for archived logs. + Under MySQL, archiving is never done. */ if (k == 0 && i == 0) { arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID; - fil_space_create((char*) "arch_log_space", arch_space_id, FIL_LOG); + fil_space_create("arch_log_space", arch_space_id, FIL_LOG); } else { arch_space_id = ULINT_UNDEFINED; } - +#endif /* UNIV_LOG_ARCHIVE */ if (i == 0) { log_group_init(k, srv_n_log_files, srv_log_file_size * UNIV_PAGE_SIZE, 2 * k + SRV_LOG_SPACE_FIRST_ID, - arch_space_id); + SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch + space id */ } return(DB_SUCCESS); @@ -649,12 +661,14 @@ open_or_create_data_files( /* out: DB_SUCCESS or error code */ ibool* create_new_db, /* out: TRUE if new database should be created */ - dulint* min_flushed_lsn,/* out: min of flushed lsn values in data - files */ +#ifdef UNIV_LOG_ARCHIVE ulint* min_arch_log_no,/* out: min of archived log numbers in data files */ - dulint* max_flushed_lsn,/* out: */ ulint* max_arch_log_no,/* out: */ +#endif /* UNIV_LOG_ARCHIVE */ + dulint* min_flushed_lsn,/* out: min of flushed lsn values in data + files */ + dulint* max_flushed_lsn,/* out: */ ulint* sum_of_new_sizes)/* out: sum of sizes of the new files added */ { ibool ret; @@ -669,7 
+683,7 @@ open_or_create_data_files( if (srv_n_data_files >= 1000) { fprintf(stderr, "InnoDB: can only have < 1000 data files\n" "InnoDB: you have defined %lu\n", - srv_n_data_files); + (ulong) srv_n_data_files); return(DB_ERROR); } @@ -687,18 +701,32 @@ open_or_create_data_files( < (sizeof name) - 1); sprintf(name, "%s%s", srv_data_home, srv_data_file_names[i]); - files[i] = os_file_create(name, OS_FILE_CREATE, + if (srv_data_file_is_raw_partition[i] == 0) { + + /* First we try to create the file: if it already + exists, ret will get value FALSE */ + + files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, OS_DATA_FILE, &ret); - if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + if (ret == FALSE && os_file_get_last_error(FALSE) != + OS_FILE_ALREADY_EXISTS) { + fprintf(stderr, + "InnoDB: Error in creating or opening %s\n", + name); + + return(DB_ERROR); + } + } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { /* The partition is opened, not created; then it is written over */ + srv_start_raw_disk_in_use = TRUE; srv_created_new_raw = TRUE; files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + name, OS_FILE_OPEN_RAW, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); @@ -706,19 +734,15 @@ open_or_create_data_files( return(DB_ERROR); } } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + srv_start_raw_disk_in_use = TRUE; + ret = FALSE; + } else { + ut_a(0); } if (ret == FALSE) { - if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW - && os_file_get_last_error() != - OS_FILE_ALREADY_EXISTS) { - fprintf(stderr, - "InnoDB: Error in creating or opening %s\n", - name); - - return(DB_ERROR); - } + /* We open the data file */ if (one_created) { fprintf(stderr, @@ -729,71 +753,83 @@ open_or_create_data_files( return(DB_ERROR); } - files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + if 
(srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + files[i] = os_file_create( + name, OS_FILE_OPEN_RAW, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + } else { + files[i] = os_file_create( + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + } + if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); - os_file_get_last_error(); + os_file_get_last_error(TRUE); return(DB_ERROR); } - if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW) { - - ret = os_file_get_size(files[i], &size, - &size_high); - ut_a(ret); - /* Round size downward to megabytes */ + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + + goto skip_size_check; + } + + ret = os_file_get_size(files[i], &size, &size_high); + ut_a(ret); + /* Round size downward to megabytes */ - rounded_size_pages = (size / (1024 * 1024) + rounded_size_pages = (size / (1024 * 1024) + 4096 * size_high) << (20 - UNIV_PAGE_SIZE_SHIFT); - if (i == srv_n_data_files - 1 + if (i == srv_n_data_files - 1 && srv_auto_extend_last_data_file) { - if (srv_data_file_sizes[i] > + if (srv_data_file_sizes[i] > rounded_size_pages || (srv_last_file_size_max > 0 && srv_last_file_size_max < rounded_size_pages)) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: auto-extending data file %s is of a different size\n" "InnoDB: %lu pages (rounded down to MB) than specified in the .cnf file:\n" "InnoDB: initial %lu pages, max %lu (relevant if non-zero) pages!\n", - name, rounded_size_pages, - srv_data_file_sizes[i], srv_last_file_size_max); + name, (ulong) rounded_size_pages, + (ulong) srv_data_file_sizes[i], + (ulong) srv_last_file_size_max); - return(DB_ERROR); - } - - srv_data_file_sizes[i] = - rounded_size_pages; + return(DB_ERROR); } + + srv_data_file_sizes[i] = rounded_size_pages; + } - if (rounded_size_pages - != srv_data_file_sizes[i]) { + if (rounded_size_pages != srv_data_file_sizes[i]) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: data file %s is of a different size\n" "InnoDB: %lu pages (rounded down 
to MB)\n" "InnoDB: than specified in the .cnf file %lu pages!\n", name, - rounded_size_pages, - srv_data_file_sizes[i]); + (ulong) rounded_size_pages, + (ulong) srv_data_file_sizes[i]); - return(DB_ERROR); - } + return(DB_ERROR); } - +skip_size_check: fil_read_flushed_lsn_and_arch_log_no(files[i], one_opened, - min_flushed_lsn, min_arch_log_no, - max_flushed_lsn, max_arch_log_no); +#ifdef UNIV_LOG_ARCHIVE + min_arch_log_no, max_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + min_flushed_lsn, max_flushed_lsn); one_opened = TRUE; } else { + /* We created the data file and now write it full of + zeros */ + one_created = TRUE; if (i > 0) { @@ -811,7 +847,7 @@ open_or_create_data_files( ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Setting file %s size to %lu MB\n", - name, (srv_data_file_sizes[i] + name, (ulong) (srv_data_file_sizes[i] >> (20 - UNIV_PAGE_SIZE_SHIFT))); fprintf(stderr, @@ -841,7 +877,13 @@ open_or_create_data_files( ut_a(fil_validate()); - fil_node_create(name, srv_data_file_sizes[i], 0); + if (srv_data_file_is_raw_partition[i]) { + + fil_node_create(name, srv_data_file_sizes[i], 0, TRUE); + } else { + fil_node_create(name, srv_data_file_sizes[i], 0, + FALSE); + } } ios = 0; @@ -862,23 +904,35 @@ innobase_start_or_create_for_mysql(void) /*====================================*/ /* out: DB_SUCCESS or error code */ { + buf_pool_t* ret; ibool create_new_db; ibool log_file_created; ibool log_created = FALSE; ibool log_opened = FALSE; dulint min_flushed_lsn; dulint max_flushed_lsn; +#ifdef UNIV_LOG_ARCHIVE ulint min_arch_log_no; ulint max_arch_log_no; - ibool start_archive; +#endif /* UNIV_LOG_ARCHIVE */ ulint sum_of_new_sizes; ulint sum_of_data_file_sizes; ulint tablespace_size_in_header; ulint err; ulint i; - ulint k; + ibool srv_file_per_table_original_value = srv_file_per_table; mtr_t mtr; + if (sizeof(ulint) != sizeof(void*)) { + fprintf(stderr, +"InnoDB: Error: size of InnoDB's ulint is %lu, but size of void* is %lu.\n" +"InnoDB: The sizes should 
be the same so that on a 64-bit platform you can\n" +"InnoDB: allocate more than 4 GB of memory.", + (ulong)sizeof(ulint), (ulong)sizeof(void*)); + } + + srv_file_per_table = FALSE; /* system tables are created in tablespace + 0 */ #ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: !!!!!!!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!!!!!!!\n"); @@ -899,12 +953,17 @@ innobase_start_or_create_for_mysql(void) "InnoDB: !!!!!!!!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!!!!!!!\n"); #endif +#ifdef UNIV_SIMULATE_AWE + fprintf(stderr, +"InnoDB: !!!!!!!!!!!!!! UNIV_SIMULATE_AWE switched on !!!!!!!!!!!!!!!!!\n"); +#endif if (srv_sizeof_trx_t_in_ha_innodb_cc != (ulint)sizeof(trx_t)) { fprintf(stderr, "InnoDB: Error: trx_t size is %lu in ha_innodb.cc but %lu in srv0start.c\n" "InnoDB: Check that pthread_mutex_t is defined in the same way in these\n" "InnoDB: compilation modules. Cannot continue.\n", - srv_sizeof_trx_t_in_ha_innodb_cc, (ulint)sizeof(trx_t)); + (ulong) srv_sizeof_trx_t_in_ha_innodb_cc, + (ulong) sizeof(trx_t)); return(DB_ERROR); } @@ -929,6 +988,17 @@ innobase_start_or_create_for_mysql(void) srv_startup_is_before_trx_rollback_phase = TRUE; os_aio_use_native_aio = FALSE; +#if !defined(__WIN2000__) && !defined(UNIV_SIMULATE_AWE) + if (srv_use_awe) { + + fprintf(stderr, +"InnoDB: Error: You have specified innodb_buffer_pool_awe_mem_mb\n" +"InnoDB: in my.cnf, but AWE can only be used in Windows 2000 and later.\n"); + + return(DB_ERROR); + } +#endif + #ifdef __WIN__ if (os_get_os_version() == OS_WIN95 || os_get_os_version() == OS_WIN31 @@ -952,28 +1022,22 @@ innobase_start_or_create_for_mysql(void) srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; #ifndef __WIN__ - } else if (0 == ut_strcmp(srv_file_flush_method_str, - (char*)"fdatasync")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "fdatasync")) { srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; - } else if (0 == ut_strcmp(srv_file_flush_method_str, - (char*)"O_DSYNC")) { + } else if (0 == 
ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) { srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; - } else if (0 == ut_strcmp(srv_file_flush_method_str, - (char*)"O_DIRECT")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; - } else if (0 == ut_strcmp(srv_file_flush_method_str, - (char*)"littlesync")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; - } else if (0 == ut_strcmp(srv_file_flush_method_str, - (char*)"nosync")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { srv_unix_file_flush_method = SRV_UNIX_NOSYNC; #else - } else if (0 == ut_strcmp(srv_file_flush_method_str, - (char*)"normal")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { srv_win_file_flush_method = SRV_WIN_IO_NORMAL; os_aio_use_native_aio = FALSE; @@ -991,6 +1055,31 @@ innobase_start_or_create_for_mysql(void) srv_file_flush_method_str); return(DB_ERROR); } + + /* Set the maximum number of threads which can wait for a semaphore + inside InnoDB */ +#if defined(__WIN__) || defined(__NETWARE__) + +/* Create less event semaphores because Win 98/ME had difficulty creating +40000 event semaphores. +Comment from Novell, Inc.: also, these just take a lot of memory on +NetWare. */ + srv_max_n_threads = 1000; +#else + if (srv_pool_size >= 8 * 1024) { + /* Here we still have srv_pool_size counted + in kilobytes, srv_boot converts the value to + pages; if buffer pool is less than 8 MB, + assume fewer threads. */ + srv_max_n_threads = 10000; + } else { + srv_max_n_threads = 1000; /* saves several MB of memory, + especially in 64-bit + computers */ + } +#endif + /* Note that the call srv_boot() also changes the values of + srv_pool_size etc. to the units used by InnoDB internally */ /* Set the maximum number of threads which can wait for a semaphore inside InnoDB */ @@ -1043,7 +1132,6 @@ NetWare. 
*/ if (!os_aio_use_native_aio) { /* In simulated aio we currently have use only for 4 threads */ - srv_n_file_io_threads = 4; os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD @@ -1057,9 +1145,28 @@ NetWare. */ SRV_MAX_N_PENDING_SYNC_IOS); } - fil_init(SRV_MAX_N_OPEN_FILES); + fil_init(srv_max_n_open_files); + + if (srv_use_awe) { + fprintf(stderr, +"InnoDB: Using AWE: Memory window is %lu MB and AWE memory is %lu MB\n", + (ulong) (srv_awe_window_size / ((1024 * 1024) / UNIV_PAGE_SIZE)), + (ulong) (srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE))); + + /* We must disable adaptive hash indexes because they do not + tolerate remapping of pages in AWE */ + + srv_use_adaptive_hash_indexes = FALSE; + ret = buf_pool_init(srv_pool_size, srv_pool_size, + srv_awe_window_size); + } else { + ret = buf_pool_init(srv_pool_size, srv_pool_size, + srv_pool_size); + } - buf_pool_init(srv_pool_size, srv_pool_size); + if (ret == NULL) { + return(DB_ERROR); + } fsp_init(); log_init(); @@ -1070,10 +1177,10 @@ NetWare. */ for (i = 0; i < srv_n_file_io_threads; i++) { n[i] = i; - os_thread_create(io_handler_thread, n + i, thread_ids + i); } +#ifdef UNIV_LOG_ARCHIVE if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) { fprintf(stderr, "InnoDB: Error: you must set the log group home dir in my.cnf the\n" @@ -1081,9 +1188,9 @@ NetWare. */ return(DB_ERROR); } +#endif /* UNIV_LOG_ARCHIVE */ if (srv_n_log_files * srv_log_file_size >= 262144) { - fprintf(stderr, "InnoDB: Error: combined size of log files must be < 4 GB\n"); @@ -1113,8 +1220,10 @@ NetWare. */ } err = open_or_create_data_files(&create_new_db, - &min_flushed_lsn, &min_arch_log_no, - &max_flushed_lsn, &max_arch_log_no, +#ifdef UNIV_LOG_ARCHIVE + &min_arch_log_no, &max_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &min_flushed_lsn, &max_flushed_lsn, &sum_of_new_sizes); if (err != DB_SUCCESS) { fprintf(stderr, @@ -1129,41 +1238,27 @@ NetWare. 
*/ return((int) err); } - if (!create_new_db) { - /* If we are using the doublewrite method, we will - check if there are half-written pages in data files, - and restore them from the doublewrite buffer if - possible */ - - if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { - - trx_sys_doublewrite_restore_corrupt_pages(); - } - } - +#ifdef UNIV_LOG_ARCHIVE srv_normalize_path_for_win(srv_arch_dir); srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir); +#endif /* UNIV_LOG_ARCHIVE */ + + for (i = 0; i < srv_n_log_files; i++) { + err = open_or_create_log_file(create_new_db, &log_file_created, + log_opened, 0, i); + if (err != DB_SUCCESS) { - for (k = 0; k < srv_n_log_groups; k++) { - - for (i = 0; i < srv_n_log_files; i++) { - - err = open_or_create_log_file(&log_file_created, - log_opened, k, i); - if (err != DB_SUCCESS) { - - return((int) err); - } - - if (log_file_created) { - log_created = TRUE; - } else { - log_opened = TRUE; - } + return((int) err); + } - if ((log_opened && create_new_db) + if (log_file_created) { + log_created = TRUE; + } else { + log_opened = TRUE; + } + if ((log_opened && create_new_db) || (log_opened && log_created)) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: all log files must be created at the same time.\n" "InnoDB: All log files must be created also in database creation.\n" "InnoDB: If you want bigger or smaller log files, shut down the\n" @@ -1171,16 +1266,25 @@ NetWare. */ "InnoDB: Then delete the existing log files. 
Edit the .cnf file\n" "InnoDB: and start the database again.\n"); - return(DB_ERROR); - } - + return(DB_ERROR); } } - if (log_created && !create_new_db && !srv_archive_recovery) { + /* Open all log files and data files in the system tablespace: we + keep them open until database shutdown */ + fil_open_log_and_system_tablespace_files(); + + if (log_created && !create_new_db +#ifdef UNIV_LOG_ARCHIVE + && !srv_archive_recovery +#endif /* UNIV_LOG_ARCHIVE */ + ) { if (ut_dulint_cmp(max_flushed_lsn, min_flushed_lsn) != 0 - || max_arch_log_no != min_arch_log_no) { +#ifdef UNIV_LOG_ARCHIVE + || max_arch_log_no != min_arch_log_no +#endif /* UNIV_LOG_ARCHIVE */ + ) { fprintf(stderr, "InnoDB: Cannot initialize created log files because\n" "InnoDB: data files were not in sync with each other\n" @@ -1203,8 +1307,14 @@ NetWare. */ mutex_enter(&(log_sys->mutex)); - recv_reset_logs(max_flushed_lsn, max_arch_log_no + 1, TRUE); - +#ifdef UNIV_LOG_ARCHIVE + /* Do not + 1 arch_log_no because we do not use log + archiving */ + recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE); +#else + recv_reset_logs(max_flushed_lsn, TRUE); +#endif /* UNIV_LOG_ARCHIVE */ + mutex_exit(&(log_sys->mutex)); } @@ -1219,10 +1329,10 @@ NetWare. */ dict_create(); srv_startup_is_before_trx_rollback_phase = FALSE; +#ifdef UNIV_LOG_ARCHIVE } else if (srv_archive_recovery) { fprintf(stderr, "InnoDB: Starting archive recovery from a backup...\n"); - err = recv_recovery_from_archive_start( min_flushed_lsn, srv_archive_recovery_limit_lsn, @@ -1231,14 +1341,11 @@ NetWare. */ return(DB_ERROR); } - /* Since ibuf init is in dict_boot, and ibuf is needed in any disk i/o, first call dict_boot */ dict_boot(); - trx_sys_init_at_db_start(); - srv_startup_is_before_trx_rollback_phase = FALSE; /* Initialize the fsp free limit global variable in the log @@ -1246,9 +1353,10 @@ NetWare. 
*/ fsp_header_get_free_limit(0); recv_recovery_from_archive_finish(); +#endif /* UNIV_LOG_ARCHIVE */ } else { /* We always try to do a recovery, even if the database had - been shut down normally */ + been shut down normally: this is the normal startup path */ err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT, ut_dulint_max, @@ -1294,6 +1402,8 @@ NetWare. */ log_make_checkpoint_at(ut_dulint_max, TRUE); +#ifdef UNIV_LOG_ARCHIVE + /* Archiving is always off under MySQL */ if (!srv_log_archive_on) { ut_a(DB_SUCCESS == log_archive_noarchivelog()); } else { @@ -1311,6 +1421,14 @@ NetWare. */ ut_a(DB_SUCCESS == log_archive_archivelog()); } } +#endif /* UNIV_LOG_ARCHIVE */ + if (!create_new_db && srv_force_recovery == 0) { + /* After a crash recovery we only check that the info in data + dictionary is consistent with what we already know about space + id's from the call of fil_load_single_table_tablespaces(). */ + + dict_check_tablespaces_or_store_max_id(recv_needed_recovery); + } if (srv_measure_contention) { /* os_thread_create(&test_measure_cont, NULL, thread_ids + @@ -1324,17 +1442,28 @@ NetWare. */ and prints InnoDB monitor info */ os_thread_create(&srv_lock_timeout_and_monitor_thread, NULL, - thread_ids + 2 + SRV_MAX_N_IO_THREADS); + thread_ids + 2 + SRV_MAX_N_IO_THREADS); /* Create the thread which warns of long semaphore waits */ os_thread_create(&srv_error_monitor_thread, NULL, - thread_ids + 3 + SRV_MAX_N_IO_THREADS); + thread_ids + 3 + SRV_MAX_N_IO_THREADS); srv_was_started = TRUE; srv_is_being_started = FALSE; +#ifdef UNIV_DEBUG + /* Wait a while so that the created threads have time to suspend + themselves before we switch sync debugging on; otherwise a thread may + execute mutex_enter() before the checks are on, and mutex_exit() after + the checks are on, which will cause an assertion failure in sync + debug. 
*/ + + os_thread_sleep(3000000); +#endif sync_order_checks_on = TRUE; - if (srv_use_doublewrite_buf && trx_doublewrite == NULL) { + if (srv_use_doublewrite_buf && trx_doublewrite == NULL) { + /* Create the doublewrite buffer to a new tablespace */ + trx_sys_create_doublewrite_buf(); } @@ -1344,8 +1473,8 @@ NetWare. */ return((int)DB_ERROR); } - /* Create the master thread which monitors the database - server, and does purge and other utility operations */ + /* Create the master thread which does purge and other utility + operations */ os_thread_create(&srv_master_thread, NULL, thread_ids + 1 + SRV_MAX_N_IO_THREADS); @@ -1365,7 +1494,8 @@ NetWare. */ fprintf(stderr, "InnoDB: Error: tablespace size stored in header is %lu pages, but\n" "InnoDB: the sum of data file sizes is %lu pages\n", - tablespace_size_in_header, sum_of_data_file_sizes); + (ulong) tablespace_size_in_header, + (ulong) sum_of_data_file_sizes); } if (srv_auto_extend_last_data_file @@ -1374,10 +1504,11 @@ NetWare. */ fprintf(stderr, "InnoDB: Error: tablespace size stored in header is %lu pages, but\n" "InnoDB: the sum of data file sizes is only %lu pages\n", - tablespace_size_in_header, sum_of_data_file_sizes); + (ulong) tablespace_size_in_header, + (ulong) sum_of_data_file_sizes); } - /* Check that os_fast_mutexes work as exptected */ + /* Check that os_fast_mutexes work as expected */ os_fast_mutex_init(&srv_os_test_mutex); if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) { @@ -1395,43 +1526,73 @@ NetWare. */ os_fast_mutex_free(&srv_os_test_mutex); - /***********************************************************/ - /* Do NOT merge to the 4.1 code base! */ - if (trx_sys_downgrading_from_4_1_1) { + if (srv_print_verbose_log) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Started; log sequence number %lu %lu\n", + (ulong) ut_dulint_get_high(srv_start_lsn), + (ulong) ut_dulint_get_low(srv_start_lsn)); + } + + if (srv_force_recovery > 0) { + fprintf(stderr, + "InnoDB: !!! 
innodb_force_recovery is set to %lu !!!\n", + (ulong) srv_force_recovery); + } + + fflush(stderr); + + if (trx_doublewrite_must_reset_space_ids) { + /* Actually, we did not change the undo log format between + 4.0 and 4.1.1, and we would not need to run purge to + completion. Note also that the purge algorithm in 4.1.1 + can process the history list again even after a full + purge, because our algorithm does not cut the end of the + history list in all cases so that it would become empty + after a full purge. That means that we may purge 4.0 type + undo log even after this phase. + + The insert buffer record format changed between 4.0 and + 4.1.1. It is essential that the insert buffer is emptied + here! */ + fprintf(stderr, -"InnoDB: You are downgrading from an InnoDB version which allows multiple\n" +"InnoDB: You are upgrading to an InnoDB version which allows multiple\n" "InnoDB: tablespaces. Wait that purge and insert buffer merge run to\n" "InnoDB: completion...\n"); for (;;) { - os_thread_sleep(10000000); + os_thread_sleep(1000000); if (0 == strcmp(srv_main_thread_op_info, "waiting for server activity")) { + + ut_a(ibuf_is_empty()); + break; } } fprintf(stderr, "InnoDB: Full purge and insert buffer merge completed.\n"); - trx_sys_mark_downgraded_from_4_1_1(); + trx_sys_mark_upgraded_to_multiple_tablespaces(); fprintf(stderr, -"InnoDB: Downgraded from >= 4.1.1 to 4.0\n"); +"InnoDB: You have now successfully upgraded to the multiple tablespaces\n" +"InnoDB: format. You should NOT DOWNGRADE again to an earlier version of\n" +"InnoDB: InnoDB! 
But if you absolutely need to downgrade, see section 4.6 of\n" +"InnoDB: http://www.innodb.com/ibman.php for instructions.\n"); } - /***********************************************************/ - if (srv_print_verbose_log) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Started\n"); - } + if (srv_force_recovery == 0) { + /* In the insert buffer we may have even bigger tablespace + id's, because we may have dropped those tablespaces, but + insert buffer merge has not had time to clean the records from + the ibuf tree. */ - if (srv_force_recovery > 0) { - fprintf(stderr, - "InnoDB: !!! innodb_force_recovery is set to %lu !!!\n", - srv_force_recovery); + ibuf_update_max_tablespace_id(); } - fflush(stderr); + srv_file_per_table = srv_file_per_table_original_value; return((int) DB_SUCCESS); } @@ -1450,17 +1611,16 @@ innobase_shutdown_for_mysql(void) if (srv_is_being_started) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Warning: shutting down a not properly started\n"); - fprintf(stderr, - " InnoDB: or created database!\n"); +" InnoDB: Warning: shutting down a not properly started\n" +" InnoDB: or created database!\n"); } return(DB_SUCCESS); } - /* 1. Flush buffer pool to disk, write the current lsn to + /* 1. Flush the buffer pool to disk, write the current lsn to the tablespace header(s), and copy all log data to archive. - The step 1 is the real InnoDB shutdown. The remaining steps + The step 1 is the real InnoDB shutdown. The remaining steps 2 - ... just free data structures after the shutdown. */ logs_empty_and_mark_files_at_shutdown(); @@ -1484,16 +1644,16 @@ innobase_shutdown_for_mysql(void) /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM HERE OR EARLIER */ - /* 1. Let the lock timeout thread exit */ + /* a. Let the lock timeout thread exit */ os_event_set(srv_lock_timeout_thread_event); - /* 2. srv error monitor thread exits automatically, no need + /* b. 
srv error monitor thread exits automatically, no need to do anything here */ - /* 3. We wake the master thread so that it exits */ + /* c. We wake the master thread so that it exits */ srv_wake_master_thread(); - /* 4. Exit the i/o threads */ + /* d. Exit the i/o threads */ os_aio_wake_all_threads_at_shutdown(); @@ -1521,7 +1681,7 @@ innobase_shutdown_for_mysql(void) if (i == 1000) { fprintf(stderr, "InnoDB: Warning: %lu threads created by InnoDB had not exited at shutdown!\n", - os_thread_count); + (ulong) os_thread_count); } if (srv_monitor_file) { @@ -1555,8 +1715,8 @@ innobase_shutdown_for_mysql(void) fprintf(stderr, "InnoDB: Warning: some resources were not cleaned up in shutdown:\n" "InnoDB: threads %lu, events %lu, os_mutexes %lu, os_fast_mutexes %lu\n", - os_thread_count, os_event_count, os_mutex_count, - os_fast_mutex_count); + (ulong) os_thread_count, (ulong) os_event_count, + (ulong) os_mutex_count, (ulong) os_fast_mutex_count); } if (dict_foreign_err_file) { @@ -1568,7 +1728,10 @@ innobase_shutdown_for_mysql(void) if (srv_print_verbose_log) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Shutdown completed\n"); + fprintf(stderr, +" InnoDB: Shutdown completed; log sequence number %lu %lu\n", + (ulong) ut_dulint_get_high(srv_shutdown_lsn), + (ulong) ut_dulint_get_low(srv_shutdown_lsn)); } return((int) DB_SUCCESS); diff --git a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c index 176aedb6ae3..7cd221df6a5 100644 --- a/innobase/sync/sync0arr.c +++ b/innobase/sync/sync0arr.c @@ -53,7 +53,7 @@ struct sync_cell_struct { rw_lock_t* old_wait_rw_lock;/* the latest wait rw-lock in cell */ ulint request_type; /* lock type requested on the object */ - char* file; /* in debug version file where + const char* file; /* in debug version file where requested */ ulint line; /* in debug version line where requested */ @@ -329,7 +329,7 @@ sync_array_reserve_cell( sync_array_t* arr, /* in: wait array */ void* object, /* in: pointer to the object to wait for */ 
ulint type, /* in: lock request type */ - char* file, /* in: file where requested */ + const char* file, /* in: file where requested */ ulint line, /* in: line where requested */ ulint* index) /* out: index of the reserved cell */ { @@ -455,8 +455,9 @@ sync_array_cell_print( fprintf(file, "--Thread %lu has waited at %s line %lu for %.2f seconds the semaphore:\n", - os_thread_pf(cell->thread), cell->file, cell->line, - difftime(time(NULL), cell->reservation_time)); + (ulong) os_thread_pf(cell->thread), cell->file, + (ulong) cell->line, + difftime(time(NULL), cell->reservation_time)); if (type == SYNC_MUTEX) { /* We use old_wait_mutex in case the cell has already @@ -469,12 +470,12 @@ sync_array_cell_print( "Last time reserved in file %s line %lu, " #endif /* UNIV_SYNC_DEBUG */ "waiters flag %lu\n", - mutex, mutex->cfile_name, mutex->cline, - mutex->lock_word, + mutex, mutex->cfile_name, (ulong) mutex->cline, + (ulong) mutex->lock_word, #ifdef UNIV_SYNC_DEBUG - mutex->file_name, mutex->line, + mutex->file_name, (ulong) mutex->line, #endif /* UNIV_SYNC_DEBUG */ - mutex->waiters); + (ulong) mutex->waiters); } else if (type == RW_LOCK_EX || type == RW_LOCK_SHARED) { @@ -484,11 +485,12 @@ sync_array_cell_print( fprintf(file, " RW-latch at %p created in file %s line %lu\n", - rwlock, rwlock->cfile_name, rwlock->cline); + rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); if (rwlock->writer != RW_LOCK_NOT_LOCKED) { fprintf(file, "a writer (thread id %lu) has reserved it in mode %s", - os_thread_pf(rwlock->writer_thread), + (ulong) os_thread_pf(rwlock->writer_thread), rwlock->writer == RW_LOCK_EX ? 
" exclusive\n" : " wait exclusive\n"); @@ -498,9 +500,12 @@ sync_array_cell_print( "number of readers %lu, waiters flag %lu\n" "Last time read locked in file %s line %lu\n" "Last time write locked in file %s line %lu\n", - rwlock->reader_count, rwlock->waiters, - rwlock->last_s_file_name, rwlock->last_s_line, - rwlock->last_x_file_name, rwlock->last_x_line); + (ulong) rwlock->reader_count, + (ulong) rwlock->waiters, + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, + (ulong) rwlock->last_x_line); } else { ut_error; } @@ -647,8 +652,8 @@ sync_array_detect_deadlock( if (ret) { fprintf(stderr, "Mutex %p owned by thread %lu file %s line %lu\n", - mutex, os_thread_pf(mutex->thread_id), - mutex->file_name, mutex->line); + mutex, (ulong) os_thread_pf(mutex->thread_id), + mutex->file_name, (ulong) mutex->line); sync_array_cell_print(stderr, cell); return(TRUE); @@ -969,7 +974,7 @@ sync_array_output_info( fprintf(file, "OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n", - arr->res_count, arr->sg_count); + (long) arr->res_count, (long) arr->sg_count); i = 0; count = 0; diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index e3caa24cb1e..77757685208 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -89,7 +89,7 @@ void rw_lock_create_func( /*================*/ rw_lock_t* lock, /* in: pointer to memory */ - char* cfile_name, /* in: file name where created */ + const char* cfile_name, /* in: file name where created */ ulint cline) /* in: file line where created */ { /* If this is the very first time a synchronization @@ -119,12 +119,17 @@ rw_lock_create_func( lock->cfile_name = cfile_name; lock->cline = cline; - lock->last_s_file_name = (char *) "not yet reserved"; - lock->last_x_file_name = (char *) "not yet reserved"; + lock->last_s_file_name = "not yet reserved"; + lock->last_x_file_name = "not yet reserved"; lock->last_s_line = 0; lock->last_x_line = 0; mutex_enter(&rw_lock_list_mutex); + + if 
(UT_LIST_GET_LEN(rw_lock_list) > 0) { + ut_a(UT_LIST_GET_FIRST(rw_lock_list)->magic_n + == RW_LOCK_MAGIC_N); + } UT_LIST_ADD_FIRST(list, rw_lock_list, lock); @@ -141,7 +146,9 @@ rw_lock_free( /*=========*/ rw_lock_t* lock) /* in: rw-lock */ { - ut_ad(rw_lock_validate(lock)); +#ifdef UNIV_DEBUG + ut_a(rw_lock_validate(lock)); +#endif /* UNIV_DEBUG */ ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); ut_a(rw_lock_get_waiters(lock) == 0); ut_a(rw_lock_get_reader_count(lock) == 0); @@ -152,6 +159,13 @@ rw_lock_free( mutex_enter(&rw_lock_list_mutex); + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + if (UT_LIST_GET_NEXT(list, lock)) { + ut_a(UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + UT_LIST_REMOVE(list, rw_lock_list, lock); mutex_exit(&rw_lock_list_mutex); @@ -197,7 +211,7 @@ rw_lock_s_lock_spin( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name, /* in: file name where lock requested */ + const char* file_name, /* in: file name where lock requested */ ulint line) /* in: line where requested */ { ulint index; /* index of the reserved wait cell */ @@ -227,8 +241,8 @@ lock_loop: if (srv_print_latch_waits) { fprintf(stderr, "Thread %lu spin wait rw-s-lock at %p cfile %s cline %lu rnds %lu\n", - os_thread_pf(os_thread_get_curr_id()), lock, - lock->cfile_name, lock->cline, i); + (ulong) os_thread_pf(os_thread_get_curr_id()), lock, + lock->cfile_name, (ulong) lock->cline, (ulong) i); } mutex_enter(rw_lock_get_mutex(lock)); @@ -257,8 +271,8 @@ lock_loop: if (srv_print_latch_waits) { fprintf(stderr, "Thread %lu OS wait rw-s-lock at %p cfile %s cline %lu\n", - os_thread_pf(os_thread_get_curr_id()), lock, - lock->cfile_name, lock->cline); + os_thread_pf(os_thread_get_curr_id()), + lock, lock->cfile_name, (ulong) lock->cline); } rw_s_system_call_count++; @@ -308,7 +322,7 @@ 
rw_lock_x_lock_low( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line) /* in: line where requested */ { #ifdef UNIV_SYNC_DEBUG @@ -413,7 +427,7 @@ rw_lock_x_lock_func( rw_lock_t* lock, /* in: pointer to rw-lock */ ulint pass, /* in: pass value; != 0, if the lock will be passed to another thread to unlock */ - char* file_name,/* in: file name where lock requested */ + const char* file_name,/* in: file name where lock requested */ ulint line) /* in: line where requested */ { ulint index; /* index of the reserved wait cell */ @@ -477,7 +491,7 @@ lock_loop: fprintf(stderr, "Thread %lu spin wait rw-x-lock at %p cfile %s cline %lu rnds %lu\n", os_thread_pf(os_thread_get_curr_id()), lock, - lock->cfile_name, lock->cline, i); + lock->cfile_name, (ulong) lock->cline, (ulong) i); } rw_x_spin_wait_count++; @@ -510,7 +524,7 @@ lock_loop: fprintf(stderr, "Thread %lu OS wait for rw-x-lock at %p cfile %s cline %lu\n", os_thread_pf(os_thread_get_curr_id()), lock, - lock->cfile_name, lock->cline); + lock->cfile_name, (ulong) lock->cline); } rw_x_system_call_count++; @@ -535,7 +549,7 @@ rw_lock_debug_mutex_enter(void) { loop: if (0 == mutex_enter_nowait(&rw_lock_debug_mutex, - IB__FILE__, __LINE__)) { + __FILE__, __LINE__)) { return; } @@ -544,7 +558,7 @@ loop: rw_lock_debug_waiters = TRUE; if (0 == mutex_enter_nowait(&rw_lock_debug_mutex, - IB__FILE__, __LINE__)) { + __FILE__, __LINE__)) { return; } @@ -577,7 +591,7 @@ rw_lock_add_debug_info( rw_lock_t* lock, /* in: rw-lock */ ulint pass, /* in: pass value */ ulint lock_type, /* in: lock type */ - char* file_name, /* in: file where requested */ + const char* file_name, /* in: file where requested */ ulint line) /* in: line where requested */ { rw_lock_debug_t* info; @@ -837,7 +851,8 @@ rw_lock_debug_print( rwt 
= info->lock_type; fprintf(stderr, "Locked: thread %ld file %s line %ld ", - os_thread_pf(info->thread_id), info->file_name, info->line); + (ulong) os_thread_pf(info->thread_id), info->file_name, + (ulong) info->line); if (rwt == RW_LOCK_SHARED) { fputs("S-LOCK", stderr); } else if (rwt == RW_LOCK_EX) { @@ -848,7 +863,7 @@ rw_lock_debug_print( ut_error; } if (info->pass != 0) { - fprintf(stderr, " pass value %lu", info->pass); + fprintf(stderr, " pass value %lu", (ulong) info->pass); } putc('\n', stderr); } diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 5ee08d83987..86306e49cac 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -201,7 +201,7 @@ void mutex_create_func( /*==============*/ mutex_t* mutex, /* in: pointer to memory */ - char* cfile_name, /* in: file name where created */ + const char* cfile_name, /* in: file name where created */ ulint cline) /* in: file line where created */ { #if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) @@ -214,7 +214,7 @@ mutex_create_func( mutex->magic_n = MUTEX_MAGIC_N; #ifdef UNIV_SYNC_DEBUG mutex->line = 0; - mutex->file_name = (char *) "not yet reserved"; + mutex->file_name = "not yet reserved"; #endif /* UNIV_SYNC_DEBUG */ mutex->level = SYNC_LEVEL_NONE; mutex->cfile_name = cfile_name; @@ -232,6 +232,10 @@ mutex_create_func( mutex_enter(&mutex_list_mutex); + if (UT_LIST_GET_LEN(mutex_list) > 0) { + ut_a(UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N); + } + UT_LIST_ADD_FIRST(list, mutex_list, mutex); mutex_exit(&mutex_list_mutex); @@ -247,7 +251,9 @@ mutex_free( /*=======*/ mutex_t* mutex) /* in: mutex */ { - ut_ad(mutex_validate(mutex)); +#ifdef UNIV_DEBUG + ut_a(mutex_validate(mutex)); +#endif /* UNIV_DEBUG */ ut_a(mutex_get_lock_word(mutex) == 0); ut_a(mutex_get_waiters(mutex) == 0); @@ -255,6 +261,15 @@ mutex_free( mutex_enter(&mutex_list_mutex); + if (UT_LIST_GET_PREV(list, mutex)) { + ut_a(UT_LIST_GET_PREV(list, mutex)->magic_n + == MUTEX_MAGIC_N); + 
} + if (UT_LIST_GET_NEXT(list, mutex)) { + ut_a(UT_LIST_GET_NEXT(list, mutex)->magic_n + == MUTEX_MAGIC_N); + } + UT_LIST_REMOVE(list, mutex_list, mutex); mutex_exit(&mutex_list_mutex); @@ -279,10 +294,10 @@ mutex_enter_nowait( /*===============*/ /* out: 0 if succeed, 1 if not */ mutex_t* mutex, /* in: pointer to mutex */ - char* file_name __attribute__((unused)), + const char* file_name __attribute__((unused)), /* in: file name where mutex requested */ - ulint line __attribute__((unused))) + ulint line __attribute__((unused))) /* in: line where requested */ { ut_ad(mutex_validate(mutex)); @@ -340,9 +355,10 @@ for the mutex before suspending the thread. */ void mutex_spin_wait( /*============*/ - mutex_t* mutex, /* in: pointer to mutex */ - char* file_name, /* in: file name where mutex requested */ - ulint line) /* in: line where requested */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where + mutex requested */ + ulint line) /* in: line where requested */ { ulint index; /* index of the reserved wait cell */ ulint i; /* spin round count */ @@ -378,8 +394,8 @@ spin_loop: if (srv_print_latch_waits) { fprintf(stderr, "Thread %lu spin wait mutex at %p cfile %s cline %lu rnds %lu\n", - os_thread_pf(os_thread_get_curr_id()), mutex, - mutex->cfile_name, mutex->cline, i); + (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); } mutex_spin_round_count += i; @@ -439,7 +455,8 @@ spin_loop: fprintf(stderr, "Thread %lu spin wait succeeds at 2:" " mutex at %p\n", - os_thread_pf(os_thread_get_curr_id()), mutex); + (ulong) os_thread_pf(os_thread_get_curr_id()), + mutex); } return; @@ -457,8 +474,8 @@ spin_loop: if (srv_print_latch_waits) { fprintf(stderr, "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", - os_thread_pf(os_thread_get_curr_id()), mutex, - mutex->cfile_name, mutex->cline, i); + (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, + mutex->cfile_name, 
(ulong) mutex->cline, (ulong) i); } mutex_system_call_count++; @@ -493,7 +510,7 @@ void mutex_set_debug_info( /*=================*/ mutex_t* mutex, /* in: mutex */ - char* file_name, /* in: file where requested */ + const char* file_name, /* in: file where requested */ ulint line) /* in: line where requested */ { ut_ad(mutex); @@ -760,13 +777,13 @@ sync_thread_levels_g( fprintf(stderr, "InnoDB error: sync levels should be > %lu but a level is %lu\n", - limit, slot->level); + (ulong) limit, (ulong) slot->level); if (mutex->magic_n == MUTEX_MAGIC_N) { fprintf(stderr, "Mutex created at %s %lu\n", mutex->cfile_name, - mutex->cline); + (ulong) mutex->cline); if (mutex_get_lock_word(mutex) != 0) { #ifdef UNIV_SYNC_DEBUG @@ -779,7 +796,7 @@ sync_thread_levels_g( fprintf(stderr, "InnoDB: Locked mutex: addr %p thread %ld file %s line %ld\n", - mutex, os_thread_pf(thread_id), file_name, line); + mutex, os_thread_pf(thread_id), file_name, (ulong) line); #else /* UNIV_SYNC_DEBUG */ fprintf(stderr, "InnoDB: Locked mutex: addr %p\n", mutex); @@ -956,7 +973,7 @@ sync_thread_add_level( } array = thread_slot->levels; - + /* NOTE that there is a problem with _NODE and _LEAF levels: if the B-tree height changes, then a leaf can change to an internal node or the other way around. 
We do not know at present if this can cause @@ -1239,10 +1256,13 @@ sync_print_wait_info( fprintf(file, "Mutex spin waits %lu, rounds %lu, OS waits %lu\n" "RW-shared spins %lu, OS waits %lu; RW-excl spins %lu, OS waits %lu\n", - mutex_spin_wait_count, mutex_spin_round_count, - mutex_os_wait_count, - rw_s_spin_wait_count, rw_s_os_wait_count, - rw_x_spin_wait_count, rw_x_os_wait_count); + (ulong) mutex_spin_wait_count, + (ulong) mutex_spin_round_count, + (ulong) mutex_os_wait_count, + (ulong) rw_s_spin_wait_count, + (ulong) rw_s_os_wait_count, + (ulong) rw_x_spin_wait_count, + (ulong) rw_x_os_wait_count); } /*********************************************************************** diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index a8b6b9fcc21..3d5f0d3f03a 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -277,7 +277,7 @@ trx_purge_add_update_undo_to_history( if (undo->id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, - "InnoDB: Error: undo->id is %lu\n", undo->id); + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); ut_error; } @@ -920,7 +920,7 @@ trx_purge_fetch_next_rec( if (srv_print_thread_releases) { fprintf(stderr, "Purge: No logs left in the history list; pages handled %lu\n", - purge_sys->n_pages_handled); + (ulong) purge_sys->n_pages_handled); } mutex_exit(&(purge_sys->mutex)); @@ -1072,7 +1072,8 @@ trx_purge(void) if (srv_print_thread_releases) { fprintf(stderr, - "Purge ends; pages handled %lu\n", purge_sys->n_pages_handled); + "Purge ends; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); } return(purge_sys->n_pages_handled - old_pages_handled); @@ -1089,16 +1090,16 @@ trx_purge_sys_print(void) read_view_print(purge_sys->view); fprintf(stderr, "InnoDB: Purge trx n:o %lu %lu, undo n_o %lu %lu\n", - ut_dulint_get_high(purge_sys->purge_trx_no), - ut_dulint_get_low(purge_sys->purge_trx_no), - ut_dulint_get_high(purge_sys->purge_undo_no), - ut_dulint_get_low(purge_sys->purge_undo_no)); + (ulong) 
ut_dulint_get_high(purge_sys->purge_trx_no), + (ulong) ut_dulint_get_low(purge_sys->purge_trx_no), + (ulong) ut_dulint_get_high(purge_sys->purge_undo_no), + (ulong) ut_dulint_get_low(purge_sys->purge_undo_no)); fprintf(stderr, "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n" "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n", - purge_sys->next_stored, - purge_sys->page_no, - purge_sys->offset, - purge_sys->hdr_page_no, - purge_sys->hdr_offset); + (ulong) purge_sys->next_stored, + (ulong) purge_sys->page_no, + (ulong) purge_sys->offset, + (ulong) purge_sys->hdr_page_no, + (ulong) purge_sys->hdr_offset); } diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index 79fad312e8e..382f723a05c 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -823,17 +823,17 @@ trx_undo_update_rec_get_update( if (field_no >= dict_index_get_n_fields(index)) { fprintf(stderr, -"InnoDB: Error: trying to access update undo rec field %lu in ", field_no); +"InnoDB: Error: trying to access update undo rec field %lu in ", (ulong) field_no); dict_index_name_print(stderr, index); fprintf(stderr, "\n" "InnoDB: but index has only %lu fields\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n" "InnoDB: Run also CHECK TABLE ", - dict_index_get_n_fields(index)); + (ulong) dict_index_get_n_fields(index)); ut_print_name(stderr, index->table_name); fprintf(stderr, "\n" "InnoDB: n_fields = %lu, i = %lu, ptr %p\n", - n_fields, i, ptr); + (ulong) n_fields, (ulong) i, ptr); return(NULL); } @@ -1066,7 +1066,7 @@ trx_undo_report_row_operation( undo_page = buf_page_get_gen(undo->space, page_no, RW_X_LATCH, undo->guess_page, BUF_GET, - IB__FILE__, __LINE__, + __FILE__, __LINE__, &mtr); #ifdef UNIV_SYNC_DEBUG @@ -1335,13 +1335,12 @@ trx_undo_prev_version_build( "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n" "InnoDB: undo rec table id %lu %lu, index table id %lu %lu\n" "InnoDB: dump of 150 bytes in undo rec: ", - dict_index_get_n_unique(index), - undo_rec, 
type, cmpl_info, - ut_dulint_get_high(table_id), - ut_dulint_get_low(table_id), - ut_dulint_get_high(index->table->id), - ut_dulint_get_low(index->table->id)); - + (ulong) dict_index_get_n_unique(index), + undo_rec, (ulong) type, (ulong) cmpl_info, + (ulong) ut_dulint_get_high(table_id), + (ulong) ut_dulint_get_low(table_id), + (ulong) ut_dulint_get_high(index->table->id), + (ulong) ut_dulint_get_low(index->table->id)); ut_print_buf(stderr, undo_rec, 150); fputs("\n" "InnoDB: index record ", stderr); @@ -1352,14 +1351,14 @@ trx_undo_prev_version_build( fprintf(stderr, "\n" "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n" "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n", - ut_dulint_get_high(rec_trx_id), - ut_dulint_get_low(rec_trx_id), - ut_dulint_get_high(trx_id), - ut_dulint_get_low(trx_id), - ut_dulint_get_high(old_roll_ptr), - ut_dulint_get_low(old_roll_ptr), - ut_dulint_get_high(roll_ptr), - ut_dulint_get_low(roll_ptr)); + (ulong) ut_dulint_get_high(rec_trx_id), + (ulong) ut_dulint_get_low(rec_trx_id), + (ulong) ut_dulint_get_high(trx_id), + (ulong) ut_dulint_get_low(trx_id), + (ulong) ut_dulint_get_high(old_roll_ptr), + (ulong) ut_dulint_get_low(old_roll_ptr), + (ulong) ut_dulint_get_high(roll_ptr), + (ulong) ut_dulint_get_low(roll_ptr)); trx_purge_sys_print(); diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index eed5e79a20f..e65755a0f73 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -117,11 +117,11 @@ trx_rollback_for_mysql( return(DB_SUCCESS); } - trx->op_info = (char *) "rollback"; + trx->op_info = "rollback"; err = trx_general_rollback_for_mysql(trx, FALSE, NULL); - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } @@ -142,14 +142,14 @@ trx_rollback_last_sql_stat_for_mysql( return(DB_SUCCESS); } - trx->op_info = (char *) "rollback of SQL statement"; + trx->op_info = "rollback of SQL statement"; err = trx_general_rollback_for_mysql(trx, TRUE, &(trx->last_sql_stat_start)); /* The 
following call should not be needed, but we play safe: */ trx_mark_sql_stat_end(trx); - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } @@ -200,7 +200,7 @@ trx_rollback_to_savepoint_for_mysql( DB_NO_SAVEPOINT, otherwise DB_SUCCESS */ trx_t* trx, /* in: transaction handle */ - char* savepoint_name, /* in: savepoint name */ + const char* savepoint_name, /* in: savepoint name */ ib_longlong* mysql_binlog_cache_pos) /* out: the MySQL binlog cache position corresponding to this savepoint; MySQL needs this @@ -240,7 +240,7 @@ trx_rollback_to_savepoint_for_mysql( *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; - trx->op_info = (char *) "rollback to a savepoint"; + trx->op_info = "rollback to a savepoint"; err = trx_general_rollback_for_mysql(trx, TRUE, &(savep->savept)); @@ -249,7 +249,7 @@ trx_rollback_to_savepoint_for_mysql( trx_mark_sql_stat_end(trx); - trx->op_info = (char *) ""; + trx->op_info = ""; return(err); } @@ -265,7 +265,7 @@ trx_savepoint_for_mysql( /*====================*/ /* out: always DB_SUCCESS */ trx_t* trx, /* in: transaction handle */ - char* savepoint_name, /* in: savepoint name */ + const char* savepoint_name, /* in: savepoint name */ ib_longlong binlog_cache_pos) /* in: MySQL binlog cache position corresponding to this connection at the time of the @@ -344,7 +344,7 @@ trx_rollback_or_clean_all_without_sess(void) trx_t* trx; dict_table_t* table; ib_longlong rows_to_undo; - char* unit = (char*)""; + const char* unit = ""; int err; mutex_enter(&kernel_mutex); @@ -391,8 +391,8 @@ loop: if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) { fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n", - ut_dulint_get_high(trx->id), - ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); trx_cleanup_at_db_startup(trx); @@ -422,14 +422,14 @@ loop: rows_to_undo = trx_roll_max_undo_no; if (rows_to_undo > 1000000000) { rows_to_undo = rows_to_undo / 1000000; - unit = (char*)"M"; + unit = 
"M"; } fprintf(stderr, "InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo", - ut_dulint_get_high(trx->id), - ut_dulint_get_low(trx->id), - (ulint)rows_to_undo, unit); + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id), + (ulong) rows_to_undo, unit); mutex_exit(&kernel_mutex); if (trx->dict_operation) { @@ -446,7 +446,7 @@ loop: fprintf(stderr, "InnoDB: Waiting for rollback of trx id %lu to end\n", - ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_low(trx->id)); os_thread_sleep(100000); mutex_enter(&kernel_mutex); @@ -460,8 +460,8 @@ loop: fprintf(stderr, "InnoDB: Dropping table with id %lu %lu in recovery if it exists\n", - ut_dulint_get_high(trx->table_id), - ut_dulint_get_low(trx->table_id)); + (ulong) ut_dulint_get_high(trx->table_id), + (ulong) ut_dulint_get_low(trx->table_id)); table = dict_table_get_on_id_low(trx->table_id, trx); @@ -481,8 +481,8 @@ loop: } fprintf(stderr, "\nInnoDB: Rolling back of trx id %lu %lu completed\n", - ut_dulint_get_high(trx->id), - ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); mem_heap_free(heap); goto loop; @@ -855,10 +855,10 @@ try_again: if (progress_pct != trx_roll_progress_printed_pct) { if (trx_roll_progress_printed_pct == 0) { fprintf(stderr, - "\nInnoDB: Progress in percents: %lu", progress_pct); + "\nInnoDB: Progress in percents: %lu", (ulong) progress_pct); } else { fprintf(stderr, - " %lu", progress_pct); + " %lu", (ulong) progress_pct); } fflush(stderr); trx_roll_progress_printed_pct = progress_pct; @@ -1142,7 +1142,7 @@ trx_finish_rollback_off_kernel( if (lock_print_waits) { fprintf(stderr, "Trx %lu rollback finished\n", - ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_low(trx->id)); } trx_commit_off_kernel(trx); diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 3601599358f..54bd5be01a1 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -26,6 +26,17 @@ Created 3/26/1996 Heikki 
Tuuri trx_sys_t* trx_sys = NULL; trx_doublewrite_t* trx_doublewrite = NULL; +/* The following is set to TRUE when we are upgrading from the old format data +files to the new >= 4.1.x format multiple tablespaces format data files */ + +ibool trx_doublewrite_must_reset_space_ids = FALSE; + +/* The following is TRUE when we are using the database in the new format, +i.e., we have successfully upgraded, or have created a new database +installation */ + +ibool trx_sys_multiple_tablespace_format = FALSE; + /* In a MySQL replication slave, in crash recovery we store the master log file name and position here. We have successfully got the updates to InnoDB up to this position. If .._pos is -1, it means no crash recovery was needed, @@ -34,44 +45,14 @@ or there was no master log position info inside InnoDB. */ char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; ib_longlong trx_sys_mysql_master_log_pos = -1; -/* Do NOT merge this to the 4.1 code base! */ -ibool trx_sys_downgrading_from_4_1_1 = FALSE; - -/******************************************************************** -Do NOT merge this to the 4.1 code base! -Marks the trx sys header when we have successfully downgraded from the >= 4.1.1 -multiple tablespace format back to the 4.0 format. */ - -void -trx_sys_mark_downgraded_from_4_1_1(void) -/*====================================*/ -{ - page_t* page; - byte* doublewrite; - mtr_t mtr; +/* If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. If .._pos is -1, it means there was no binlog position info inside +InnoDB. */ - /* Let us mark to the trx_sys header that the downgrade has been - done. 
*/ - - mtr_start(&mtr); - - page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); -#ifdef UNIV_SYNC_DEBUG - buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); -#endif /* UNIV_SYNC_DEBUG */ - - doublewrite = page + TRX_SYS_DOUBLEWRITE; - - mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, - TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N + 1, - MLOG_4BYTES, &mtr); - mtr_commit(&mtr); - - /* Flush the modified pages to disk and make a checkpoint */ - log_make_checkpoint_at(ut_dulint_max, TRUE); +char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +ib_longlong trx_sys_mysql_bin_log_pos = -1; - trx_sys_downgrading_from_4_1_1 = FALSE; -} /******************************************************************** Determines if a page number is located inside the doublewrite buffer. */ @@ -114,11 +95,11 @@ trx_doublewrite_init( { trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); - /* When we have the doublewrite buffer in use, we do not need to - call os_file_flush (Unix fsync) after every write. */ - + /* Since we now start to use the doublewrite buffer, no need to call + fsync() after every write to a data file */ + os_do_not_call_flush_at_each_write = TRUE; - + mutex_create(&(trx_doublewrite->mutex)); mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE); @@ -144,7 +125,43 @@ trx_doublewrite_init( } /******************************************************************** -Creates the doublewrite buffer at a database start. The header of the +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format. */ + +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void) +/*===============================================*/ +{ + page_t* page; + byte* doublewrite; + mtr_t mtr; + + /* We upgraded to 4.1.x and reset the space id fields in the + doublewrite buffer. Let us mark to the trx_sys header that the upgrade + has been done. 
*/ + + mtr_start(&mtr); + + page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); +#ifdef UNIV_SYNC_DEBUG + buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + doublewrite = page + TRX_SYS_DOUBLEWRITE; + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(ut_dulint_max, TRUE); + + trx_sys_multiple_tablespace_format = TRUE; +} + +/******************************************************************** +Creates the doublewrite buffer to a new InnoDB installation. The header of the doublewrite buffer is placed on the trx system header page. */ void @@ -179,31 +196,6 @@ start_again: if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) == TRX_SYS_DOUBLEWRITE_MAGIC_N) { - /* Do NOT merge to the 4.1 code base! */ - if (mach_read_from_4(doublewrite - + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) - == TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { - - fprintf(stderr, -"InnoDB: You are downgrading from the multiple tablespace format of\n" -"InnoDB: >= MySQL-4.1.1 back to the old format of MySQL-4.0.\n" -"InnoDB:\n" -"InnoDB: MAKE SURE that the mysqld server is idle, and purge and the insert\n" -"InnoDB: buffer merge have run to completion under >= 4.1.1 before trying to\n" -"InnoDB: downgrade! 
You can determine this by looking at SHOW INNODB STATUS:\n" -"InnoDB: if the Main thread is 'waiting for server activity' and SHOW\n" -"InnoDB: PROCESSLIST shows that you have ended all other connections\n" -"InnoDB: to mysqld, then purge and the insert buffer merge have been\n" -"InnoDB: completed.\n" -"InnoDB: If you have already created tables in >= 4.1.1, then those\n" -"InnoDB: tables cannot be used under 4.0.\n" -"InnoDB: NOTE THAT this downgrade procedure has not been properly tested!\n" -"InnoDB: The safe way to downgrade is to dump all InnoDB tables and recreate\n" -"InnoDB: the whole tablespace.\n"); - - trx_sys_downgrading_from_4_1_1 = TRUE; - } - /* The doublewrite buffer has already been created: just read in some numbers */ @@ -313,10 +305,15 @@ start_again: } mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, - TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE_REPEAT, - TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); mtr_commit(&mtr); /* Flush the modified pages to disk and make a checkpoint */ @@ -324,23 +321,31 @@ start_again: fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); + trx_sys_multiple_tablespace_format = TRUE; + goto start_again; } } /******************************************************************** -At a database startup uses a possible doublewrite buffer to restore +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. 
If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore half-written pages in the data files. */ void -trx_sys_doublewrite_restore_corrupt_pages(void) -/*===========================================*/ +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages) { byte* buf; byte* read_buf; byte* unaligned_read_buf; ulint block1; ulint block2; + ulint source_page_no; byte* page; byte* doublewrite; ulint space_id; @@ -352,43 +357,17 @@ trx_sys_doublewrite_restore_corrupt_pages(void) unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); - /* Read the trx sys header to check if we are using the - doublewrite buffer */ + /* Read the trx sys header to check if we are using the doublewrite + buffer */ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0, UNIV_PAGE_SIZE, read_buf, NULL); - doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) == TRX_SYS_DOUBLEWRITE_MAGIC_N) { /* The doublewrite buffer has been created */ - /* Do NOT merge to the 4.1 code base! */ - if (mach_read_from_4(doublewrite - + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) - == TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { - - fprintf(stderr, -"InnoDB: You are downgrading from the multiple tablespace format of\n" -"InnoDB: >= MySQL-4.1.1 back to the old format of MySQL-4.0.\n" -"InnoDB:\n" -"InnoDB: MAKE SURE that the mysqld server is idle, and purge and the insert\n" -"InnoDB: buffer merge have run to completion under >= 4.1.1 before trying to\n" -"InnoDB: downgrade! 
You can determine this by looking at SHOW INNODB STATUS:\n" -"InnoDB: if the Main thread is 'waiting for server activity' and SHOW\n" -"InnoDB: PROCESSLIST shows that you have ended all other connections\n" -"InnoDB: to mysqld, then purge and the insert buffer merge have been\n" -"InnoDB: completed.\n" -"InnoDB: If you have already created tables in >= 4.1.1, then those\n" -"InnoDB: tables cannot be used under 4.0.\n" -"InnoDB: NOTE THAT this downgrade procedure has not been properly tested!\n" -"InnoDB: The safe way to downgrade is to dump all InnoDB tables and recreate\n" -"InnoDB: the whole tablespace.\n"); - - trx_sys_downgrading_from_4_1_1 = TRUE; - } - trx_doublewrite_init(doublewrite); block1 = trx_doublewrite->block1; @@ -399,6 +378,23 @@ trx_sys_doublewrite_restore_corrupt_pages(void) goto leave_func; } + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. 
*/ + + trx_doublewrite_must_reset_space_ids = TRUE; + + fprintf(stderr, +"InnoDB: Resetting space id's in the doublewrite buffer\n"); + } else { + trx_sys_multiple_tablespace_format = TRUE; + } + /* Read the pages from the doublewrite buffer to memory */ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0, @@ -416,13 +412,46 @@ trx_sys_doublewrite_restore_corrupt_pages(void) for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); - space_id = 0; - if (!fil_check_adress_in_tablespace(space_id, page_no)) { + if (trx_doublewrite_must_reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. */ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + fil_io(OS_FILE_WRITE, TRUE, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + /* printf("Resetting space id in page %lu\n", + source_page_no); */ + } else { + space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + if (!restore_corrupt_pages) { + /* The database was shut down gracefully: no need to + restore pages */ + + } else if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { fprintf(stderr, - "InnoDB: Warning: an inconsistent page in the doublewrite buffer\n" - "InnoDB: space id %lu page number %lu, %lu'th page in dblwr buf.\n", - space_id, page_no, i); +"InnoDB: Warning: a page in the doublewrite buffer is not within space\n" +"InnoDB: bounds; space id %lu page number %lu, page %lu in doublewrite buf.\n", + (ulong) space_id, (ulong) page_no, (ulong) i); } else 
if (space_id == TRX_SYS_SPACE && ( (page_no >= block1 @@ -445,7 +474,7 @@ trx_sys_doublewrite_restore_corrupt_pages(void) fprintf(stderr, "InnoDB: Warning: database page corruption or a failed\n" - "InnoDB: file read of page %lu.\n", page_no); + "InnoDB: file read of page %lu.\n", (ulong) page_no); fprintf(stderr, "InnoDB: Trying to recover it from the doublewrite buffer.\n"); @@ -549,7 +578,7 @@ replication has proceeded. */ void trx_sys_update_mysql_binlog_offset( /*===============================*/ - char* file_name,/* in: MySQL log file name */ + const char* file_name,/* in: MySQL log file name */ ib_longlong offset, /* in: position in that log file */ ulint field, /* in: offset of the MySQL log info field in the trx sys header */ @@ -578,8 +607,8 @@ trx_sys_update_mysql_binlog_offset( if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), file_name)) { - mlog_write_string((byte*) (sys_header + field - + TRX_SYS_MYSQL_LOG_NAME), + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, (byte*) file_name, 1 + ut_strlen(file_name), mtr); } @@ -595,8 +624,8 @@ trx_sys_update_mysql_binlog_offset( mlog_write_ulint(sys_header + field + TRX_SYS_MYSQL_LOG_OFFSET_LOW, - (ulint)(offset & 0xFFFFFFFF), - MLOG_4BYTES, mtr); + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); } /********************************************************************* @@ -619,17 +648,17 @@ trx_sys_print_mysql_binlog_offset_from_page( fprintf(stderr, "ibbackup: Last MySQL binlog file position %lu %lu, file name %s\n", - mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), - mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET_LOW), sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME); } } /********************************************************************* -Prints to stderr 
the MySQL binlog offset info in the trx system header if -the magic number shows it valid. */ +Stores the MySQL binlog offset info in the trx system header if +the magic number shows it valid, and print the info to stderr */ void trx_sys_print_mysql_binlog_offset(void) @@ -637,7 +666,8 @@ trx_sys_print_mysql_binlog_offset(void) { trx_sysf_t* sys_header; mtr_t mtr; - + ulong trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low; + mtr_start(&mtr); sys_header = trx_sysf_get(&mtr); @@ -651,14 +681,22 @@ trx_sys_print_mysql_binlog_offset(void) return; } - fprintf(stderr, - "InnoDB: Last MySQL binlog file position %lu %lu, file name %s\n", - mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), - mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO - + TRX_SYS_MYSQL_LOG_OFFSET_LOW), - sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME); + trx_sys_mysql_bin_log_pos_high = mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); + trx_sys_mysql_bin_log_pos_low = mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + + trx_sys_mysql_bin_log_pos = (((ib_longlong)trx_sys_mysql_bin_log_pos_high) << 32) + + (ib_longlong)trx_sys_mysql_bin_log_pos_low; + ut_memcpy(trx_sys_mysql_bin_log_name, sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); + + fprintf(stderr, + "InnoDB: Last MySQL binlog file position %lu %lu, file name %s\n", + trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, + trx_sys_mysql_bin_log_name); + mtr_commit(&mtr); } @@ -689,9 +727,9 @@ trx_sys_print_mysql_master_log_pos(void) fprintf(stderr, "InnoDB: In a MySQL replication slave the last master binlog file\n" "InnoDB: position %lu %lu, file name %s\n", - mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), - 
mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET_LOW), sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME); @@ -810,7 +848,7 @@ trx_sys_init_at_db_start(void) { trx_sysf_t* sys_header; ib_longlong rows_to_undo = 0; - char* unit = (char*)""; + const char* unit = ""; trx_t* trx; mtr_t mtr; @@ -859,19 +897,19 @@ trx_sys_init_at_db_start(void) } if (rows_to_undo > 1000000000) { - unit = (char*)"M"; + unit = "M"; rows_to_undo = rows_to_undo / 1000000; } fprintf(stderr, "InnoDB: %lu transaction(s) which must be rolled back or cleaned up\n" "InnoDB: in total %lu%s row operations to undo\n", - UT_LIST_GET_LEN(trx_sys->trx_list), - (ulint)rows_to_undo, unit); + (ulong) UT_LIST_GET_LEN(trx_sys->trx_list), + (ulong) rows_to_undo, unit); fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n", - ut_dulint_get_high(trx_sys->max_trx_id), - ut_dulint_get_low(trx_sys->max_trx_id)); + (ulong) ut_dulint_get_high(trx_sys->max_trx_id), + (ulong) ut_dulint_get_low(trx_sys->max_trx_id)); } UT_LIST_INIT(trx_sys->view_list); diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 576827966ab..f7497ac4090 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -81,7 +81,7 @@ trx_create( trx->magic_n = TRX_MAGIC_N; - trx->op_info = (char *) ""; + trx->op_info = ""; trx->type = TRX_USER; trx->conc_state = TRX_NOT_STARTED; @@ -107,7 +107,7 @@ trx_create( trx->mysql_log_file_name = NULL; trx->mysql_log_offset = 0; - trx->mysql_master_log_file_name = (char*) ""; + trx->mysql_master_log_file_name = ""; trx->mysql_master_log_pos = 0; mutex_create(&(trx->undo_mutex)); @@ -701,11 +701,13 @@ trx_commit_off_kernel( TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); } - /* If we did not take the shortcut, the following call - commits the mini-transaction, making the whole transaction - committed in the file-based world at this log sequence number; - otherwise, we get the 
commit lsn from the call of - trx_undo_update_cleanup_by_discard above. + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + NOTE that transaction numbers, which are assigned only to transactions with an update undo log, do not necessarily come in exactly the same order as commit lsn's, if the transactions @@ -1485,7 +1487,7 @@ trx_commit_for_mysql( ut_a(trx); - trx->op_info = (char *) "committing"; + trx->op_info = "committing"; trx_start_if_not_started(trx); @@ -1495,7 +1497,7 @@ trx_commit_for_mysql( mutex_exit(&kernel_mutex); - trx->op_info = (char *) ""; + trx->op_info = ""; return(0); } @@ -1514,7 +1516,7 @@ trx_commit_complete_for_mysql( ut_a(trx); - trx->op_info = (char*)"flushing log"; + trx->op_info = "flushing log"; if (srv_flush_log_at_trx_commit == 0) { /* Do nothing */ @@ -1538,7 +1540,7 @@ trx_commit_complete_for_mysql( ut_error; } - trx->op_info = (char*)""; + trx->op_info = ""; return(0); } @@ -1575,8 +1577,8 @@ trx_print( ibool newline; fprintf(f, "TRANSACTION %lu %lu", - ut_dulint_get_high(trx->id), - ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); switch (trx->conc_state) { case TRX_NOT_STARTED: @@ -1584,20 +1586,20 @@ trx_print( break; case TRX_ACTIVE: fprintf(f, ", ACTIVE %lu sec", - (ulint)difftime(time(NULL), trx->start_time)); + (ulong)difftime(time(NULL), trx->start_time)); break; case TRX_COMMITTED_IN_MEMORY: fputs(", COMMITTED IN MEMORY", f); break; default: - fprintf(f, " state %lu", trx->conc_state); + fprintf(f, " state %lu", (ulong) trx->conc_state); } #ifdef UNIV_LINUX fprintf(f, ", process no %lu", trx->mysql_process_no); #endif fprintf(f, ", OS thread id %lu", - os_thread_pf(trx->mysql_thread_id)); + 
(ulong) os_thread_pf(trx->mysql_thread_id)); if (*trx->op_info) { putc(' ', f); @@ -1610,18 +1612,18 @@ trx_print( if (trx->declared_to_be_inside_innodb) { fprintf(f, ", thread declared inside InnoDB %lu", - trx->n_tickets_to_enter_innodb); + (ulong) trx->n_tickets_to_enter_innodb); } putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { - if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } - fprintf(f, "mysql tables in use %lu, locked %lu\n", - trx->n_mysql_tables_in_use, - trx->mysql_n_tables_locked); - } - newline = TRUE; switch (trx->que_state) { @@ -1634,7 +1636,7 @@ trx_print( case TRX_QUE_COMMITTING: fputs("COMMITTING ", f); break; default: - fprintf(f, "que state %lu ", trx->que_state); + fprintf(f, "que state %lu ", (ulong) trx->que_state); } if (0 < UT_LIST_GET_LEN(trx->trx_locks) || @@ -1642,8 +1644,8 @@ trx_print( newline = TRUE; fprintf(f, "%lu lock struct(s), heap size %lu", - UT_LIST_GET_LEN(trx->trx_locks), - mem_heap_get_size(trx->lock_heap)); + (ulong) UT_LIST_GET_LEN(trx->trx_locks), + (ulong) mem_heap_get_size(trx->lock_heap)); } if (trx->has_search_latch) { @@ -1654,7 +1656,7 @@ trx_print( if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) { newline = TRUE; fprintf(f, ", undo log entries %lu", - ut_dulint_get_low(trx->undo_no)); + (ulong) ut_dulint_get_low(trx->undo_no)); } if (newline) { diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c index cf8a69c8cca..79566fe01c3 100644 --- a/innobase/trx/trx0undo.c +++ b/innobase/trx/trx0undo.c @@ -387,6 +387,7 @@ trx_undo_seg_create( page_t* undo_page; trx_upagef_t* page_hdr; trx_usegf_t* seg_hdr; + ulint n_reserved; ibool success; ut_ad(mtr && id && rseg_hdr); @@ -410,8 +411,8 @@ trx_undo_seg_create( space = buf_frame_get_space_id(rseg_hdr); - success = fsp_reserve_free_extents(space, 2, FSP_UNDO, 
mtr); - + success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); if (!success) { return(NULL); @@ -421,7 +422,7 @@ trx_undo_seg_create( undo_page = fseg_create_general(space, 0, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, TRUE, mtr); - fil_space_release_free_extents(space, 2); + fil_space_release_free_extents(space, n_reserved); if (undo_page == NULL) { /* No space left */ @@ -734,6 +735,7 @@ trx_undo_add_page( page_t* new_page; trx_rseg_t* rseg; ulint page_no; + ulint n_reserved; ibool success; #ifdef UNIV_SYNC_DEBUG @@ -751,8 +753,8 @@ trx_undo_add_page( header_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); - success = fsp_reserve_free_extents(undo->space, 1, FSP_UNDO, mtr); - + success = fsp_reserve_free_extents(&n_reserved, undo->space, 1, + FSP_UNDO, mtr); if (!success) { return(FIL_NULL); @@ -763,7 +765,7 @@ trx_undo_add_page( undo->top_page_no + 1, FSP_UP, TRUE, mtr); - fil_space_release_free_extents(undo->space, 1); + fil_space_release_free_extents(undo->space, n_reserved); if (page_no == FIL_NULL) { @@ -1124,7 +1126,7 @@ trx_undo_mem_create_at_db_start( if (id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, - "InnoDB: Error: undo->id is %lu\n", id); + "InnoDB: Error: undo->id is %lu\n", (ulong) id); ut_error; } @@ -1282,7 +1284,7 @@ trx_undo_mem_create( if (id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, - "InnoDB: Error: undo->id is %lu\n", id); + "InnoDB: Error: undo->id is %lu\n", (ulong) id); ut_error; } @@ -1327,7 +1329,8 @@ trx_undo_mem_init_for_reuse( #endif /* UNIV_SYNC_DEBUG */ if (undo->id >= TRX_RSEG_N_SLOTS) { - fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", undo->id); + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); mem_analyze_corruption((byte*)undo); ut_error; @@ -1353,7 +1356,7 @@ trx_undo_mem_free( { if (undo->id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, - "InnoDB: Error: undo->id is %lu\n", undo->id); + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); ut_error; } @@ -1463,7 +1466,8 @@ 
trx_undo_reuse_cached( ut_ad(undo->size == 1); if (undo->id >= TRX_RSEG_N_SLOTS) { - fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", undo->id); + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); mem_analyze_corruption((byte*)undo); ut_error; } @@ -1599,7 +1603,8 @@ trx_undo_set_state_at_finish( ut_ad(trx && undo && mtr); if (undo->id >= TRX_RSEG_N_SLOTS) { - fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", undo->id); + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); mem_analyze_corruption((byte*)undo); ut_error; } @@ -1665,56 +1670,6 @@ trx_undo_update_cleanup( } } -/************************************************************************** -Discards an undo log and puts the segment to the list of cached update undo -log segments. This optimized function is called if there is no need to keep -the update undo log because there exist no read views and the transaction -made no delete markings, which would make purge necessary. We restrict this -to undo logs of size 1 to make things simpler. */ - -dulint -trx_undo_update_cleanup_by_discard( -/*===============================*/ - /* out: log sequence number at which mtr is - committed */ - trx_t* trx, /* in: trx owning the update undo log */ - mtr_t* mtr) /* in: mtr */ -{ - trx_rseg_t* rseg; - trx_undo_t* undo; - page_t* undo_page; - - undo = trx->update_undo; - rseg = trx->rseg; - -#ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&(rseg->mutex))); - ut_ad(mutex_own(&kernel_mutex)); -#endif /* UNIV_SYNC_DEBUG */ - ut_ad(undo->size == 1); - ut_ad(undo->del_marks == FALSE); - ut_ad(UT_LIST_GET_LEN(trx_sys->view_list) == 1); - - /* NOTE: we must hold the kernel mutex, because we must prevent - creation of new read views before mtr gets committed! 
*/ - - undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); - - trx_undo_discard_latest_update_undo(undo_page, mtr); - - undo->state = TRX_UNDO_CACHED; - - UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo); - - trx->update_undo = NULL; - - UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); - - mtr_commit(mtr); - - return(mtr->end_lsn); -} - /********************************************************************** Frees or caches an insert undo log after a transaction commit or rollback. Knowledge of inserts is not needed after a commit or rollback, therefore diff --git a/innobase/ut/ut0byte.c b/innobase/ut/ut0byte.c index 4ec7e0f405e..8764103dc36 100644 --- a/innobase/ut/ut0byte.c +++ b/innobase/ut/ut0byte.c @@ -18,7 +18,7 @@ Created 5/11/1994 Heikki Tuuri dulint ut_dulint_zero = {0, 0}; /* Maximum value for a dulint */ -dulint ut_dulint_max = {0xFFFFFFFF, 0xFFFFFFFF}; +dulint ut_dulint_max = {0xFFFFFFFFUL, 0xFFFFFFFFUL}; /**************************************************************** Sort function for dulint arrays. */ diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c index 2cab36a9580..9a591df9f77 100644 --- a/innobase/ut/ut0mem.c +++ b/innobase/ut/ut0mem.c @@ -77,8 +77,9 @@ ut_malloc_low( ret = malloc(n + sizeof(ut_mem_block_t)); if (ret == NULL) { + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Fatal error: cannot allocate %lu bytes of\n" + " InnoDB: Fatal error: cannot allocate %lu bytes of\n" "InnoDB: memory with malloc! Total allocated memory\n" "InnoDB: by InnoDB %lu bytes. 
Operating system errno: %lu\n" "InnoDB: Cannot continue operation!\n" @@ -88,11 +89,11 @@ ut_malloc_low( "InnoDB: a big enough maximum process size.\n" "InnoDB: We now intentionally generate a seg fault so that\n" "InnoDB: on Linux we get a stack trace.\n", - n, ut_total_allocated_memory, + (ulong) n, (ulong) ut_total_allocated_memory, #ifdef __WIN__ - (ulint)GetLastError() + (ulong) GetLastError() #else - (ulint)errno + (ulong) errno #endif ); @@ -141,6 +142,42 @@ ut_malloc( } /************************************************************************** +Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs +out. It cannot be used if we want to return an error message. Prints to +stderr a message if fails. */ + +ibool +ut_test_malloc( +/*===========*/ + /* out: TRUE if succeeded */ + ulint n) /* in: try to allocate this many bytes */ +{ + void* ret; + + ret = malloc(n); + + if (ret == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: cannot allocate %lu bytes of memory for\n" + "InnoDB: a BLOB with malloc! Total allocated memory\n" + "InnoDB: by InnoDB %lu bytes. Operating system errno: %d\n" + "InnoDB: Check if you should increase the swap file or\n" + "InnoDB: ulimits of your operating system.\n" + "InnoDB: On FreeBSD check you have compiled the OS with\n" + "InnoDB: a big enough maximum process size.\n", + (ulong) n, + (ulong) ut_total_allocated_memory, + (int) errno); + return(FALSE); + } + + free(ret); + + return(TRUE); +} + +/************************************************************************** Frees a memory block allocated with ut_malloc. 
*/ void @@ -265,7 +302,7 @@ ut_free_all_mem(void) if (ut_total_allocated_memory != 0) { fprintf(stderr, "InnoDB: Warning: after shutdown total allocated memory is %lu\n", - ut_total_allocated_memory); + (ulong) ut_total_allocated_memory); } } diff --git a/innobase/ut/ut0rnd.c b/innobase/ut/ut0rnd.c index 3335861384f..85d2e6094c3 100644 --- a/innobase/ut/ut0rnd.c +++ b/innobase/ut/ut0rnd.c @@ -71,9 +71,8 @@ ut_find_prime( /* Found a prime */ break; - next_n: ; +next_n: ; } return(n); } - diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index 1be4cc0e0d8..39850227162 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -30,29 +30,6 @@ mysql_get_identifier_quote_char(void); used in SQL identifiers */ /************************************************************ -Uses vsprintf to emulate sprintf so that the function always returns -the printed length. Apparently in some old SCO Unixes sprintf did not -return the printed length but a pointer to the end of the printed string. */ - -ulint -ut_sprintf( -/*=======*/ - char* buf, /* in/out: buffer where to print */ - const char* format, /* in: format of prints */ - ...) /* in: arguments to be printed */ -{ - va_list args; - - va_start(args, format); - - vsprintf(buf, format, args); - - va_end(args); - - return((ulint)strlen(buf)); -} - -/************************************************************ Gets the high 32 bits in a ulint. That is makes a shift >> 32, but since there seem to be compiler bugs in both gcc and Visual C++, we do this by a special conversion. */ @@ -73,7 +50,7 @@ ut_get_high32( } /************************************************************ -The following function returns a clock time in milliseconds. */ +The following function returns elapsed CPU time in milliseconds. */ ulint ut_clock(void) @@ -192,6 +169,50 @@ ut_sprintf_timestamp( } /************************************************************** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. 
*/ + +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf) /* in: buffer where to sprintf */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d", + (int)cal_tm.wYear % 100, + (int)cal_tm.wMonth, + (int)cal_tm.wDay, + (int)cal_tm.wHour, + (int)cal_tm.wMinute, + (int)cal_tm.wSecond); +#else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; + + time(&tm); + +#ifdef HAVE_LOCALTIME_R + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + cal_tm_ptr = localtime(&tm); +#endif + sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d", + cal_tm_ptr->tm_year % 100, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec); +#endif +} + +/************************************************************** Returns current year, month, day. */ void @@ -264,7 +285,7 @@ ut_print_buf( fprintf(file, " len %lu; hex ", len); for (data = buf, i = 0; i < len; i++) { - fprintf(file, "%02lx", (ulint)*data++); + fprintf(file, "%02lx", (ulong)*data++); } fputs("; asc ", file); |